Compare commits

debug-logg...remove_mam (425 commits)
````diff
@@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance
 ## Trigger the benchmark

 Performance benchmark will be triggered when:

 - A PR being merged into vllm.
 - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.

````
````diff
@@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 ```

 Runtime environment variables:

 - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
````
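Taken together, a CPU run that overrides these variables might look like the sketch below; the exported values and file choices are illustrative, not defaults.

```bash
# Illustrative only: enable the CPU path and point the harness at the CPU test files.
export ON_CPU=1
export LATENCY_JSON=.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
export SERVING_JSON=.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```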
````diff
@@ -46,12 +48,14 @@ Runtime environment variables:
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

 Nightly benchmark will be triggered when:

 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

 ## Performance benchmark details

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+>
 ### Latency test

 Here is an example of one test inside `latency-tests.json`:
````
````diff
@@ -74,7 +78,7 @@ Here is an example of one test inside `latency-tests.json`:
 In this example:

 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.

````
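To make the underscore-to-dash rule concrete, here is a small sketch of the conversion. The test entry is assumed (the actual `latency-tests.json` block is not reproduced in this hunk), and the jq expression only mirrors the idea behind the harness's `json2args` helper, not its exact code.

```bash
# Assumed entry, consistent with the flags listed above.
params='{ "model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1,
          "load_format": "dummy", "num_iters_warmup": 5, "num_iters": 15 }'
# Keys become flags: underscores turn into dashes before the value is appended.
args=$(echo "$params" | jq -r \
  'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")')
echo "vllm bench latency $args"
# vllm bench latency --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 \
#   --load-format dummy --num-iters-warmup 5 --num-iters 15
```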
````diff
@@ -82,13 +86,13 @@ WARNING: The benchmarking script will save json results by itself, so please do

 ### Throughput test

-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.

 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.

 ### Serving test

-We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

 ```json
 [
````
````diff
@@ -100,7 +104,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf
         "tensor_parallel_size": 1,
         "swap_space": 16,
         "disable_log_stats": "",
-        "disable_log_requests": "",
         "load_format": "dummy"
     },
     "client_parameters": {
````
````diff
@@ -118,8 +121,8 @@ Inside this example:

 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
-- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
-- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+- The `client-parameters` includes the command line arguments for `vllm bench serve`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`

 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.

````
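For orientation, those pieces compose into a client invocation roughly like the one below; the model, dataset, and prompt count are borrowed from examples elsewhere in this diff and are assumptions here, not fixed defaults.

```bash
# Rough shape of the client command for a qps_list entry of 1 (values illustrative).
vllm bench serve \
  --backend vllm \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --request-rate 1
```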
````diff
@@ -149,6 +152,7 @@ Here is an example using the script to compare result_a and result_b without detail test name.

 Here is an example using the script to compare result_a and result_b with detail test name.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

 | | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
 |---|---|---|---|---|---|
 | 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
````
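Judging from the row above, `perf_ratio` is the second file's value divided by the first's:

```bash
python3 -c 'print(156.526018 / 142.633982)'   # ≈ 1.097396
```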
````diff
@@ -1,3 +1,4 @@
+# Nightly benchmark annotation

 ## Description

````
````diff
@@ -13,15 +14,15 @@ Please download the visualization scripts in the post

 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
   - Download `nightly-benchmarks.zip`.
   - In the same folder, run the following code:

   ```bash
   export HF_TOKEN=<your HF token>
   apt update
   apt install -y git
   unzip nightly-benchmarks.zip
   VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
   ```

 And the results will be inside `./benchmarks/results`.
````
````diff
@@ -13,25 +13,25 @@ Latest reproduction guide: [github issue link](https://github.com/vllm-project/
 ## Setup

 - Docker images:
   - vLLM: `vllm/vllm-openai:v0.6.2`
   - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
   - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
   - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
     - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
   - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
   - 8x Nvidia A100 GPUs
 - Workload:
   - Dataset
     - ShareGPT dataset
     - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
     - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
     - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
   - Models: llama-3 8B, llama-3 70B.
     - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
   - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
   - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
 - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).

 ## Known issues

````
````diff
@@ -1,3 +1,4 @@
+# Performance benchmarks descriptions

 ## Latency tests

````
````diff
@@ -44,6 +44,7 @@ serving_column_mapping = {
     "test_name": "Test name",
     "gpu_type": "GPU",
     "completed": "# of req.",
+    "max_concurrency": "# of max concurrency.",
     "request_throughput": "Tput (req/s)",
     "total_token_throughput": "Total Token Tput (tok/s)",
     "output_throughput": "Output Tput (tok/s)",
````
````diff
@@ -100,7 +101,7 @@ if __name__ == "__main__":
         raw_result = json.loads(f.read())

         if "serving" in str(test_file):
-            # this result is generated via `benchmark_serving.py`
+            # this result is generated via `vllm bench serve` command

             # attach the benchmarking command to raw_result
             try:
````
````diff
@@ -120,7 +121,7 @@ if __name__ == "__main__":
                 continue

         elif "latency" in f.name:
-            # this result is generated via `benchmark_latency.py`
+            # this result is generated via `vllm bench latency` command

             # attach the benchmarking command to raw_result
             try:
````
````diff
@@ -148,7 +149,7 @@ if __name__ == "__main__":
                 continue

         elif "throughput" in f.name:
-            # this result is generated via `benchmark_throughput.py`
+            # this result is generated via `vllm bench throughput` command

             # attach the benchmarking command to raw_result
             try:
````
````diff
@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
         echo "Container: vllm"
         # move to a completely irrelevant directory, to avoid import vllm from current folder
         export CURRENT_LLM_SERVING_ENGINE=vllm

         return
     fi
 }
````
````diff
@@ -95,12 +95,14 @@ json2args() {
 }

 kill_gpu_processes() {
-  pkill -f python
-  pkill -f python3
-  pkill -f tritonserver
-  pkill -f pt_main_thread
-  pkill -f text-generation
-  pkill -f lmdeploy
+  pkill -f '[p]ython'
+  pkill -f '[p]ython3'
+  pkill -f '[t]ritonserver'
+  pkill -f '[p]t_main_thread'
+  pkill -f '[t]ext-generation'
+  pkill -f '[l]mdeploy'
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+  pkill -f '[V]LLM'

   while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
     sleep 1
````
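A note on the bracketed patterns: the literal text `[p]ython` does not match the regex `[p]ython`, so a wrapper process whose command line happens to carry the cleanup snippet is left alone, while real `python` processes are still matched. A quick way to see the effect:

```bash
sleep 300 &
bash -c 'pgrep -af "sleep"'     # matches the sleep process AND this wrapper's own command line
bash -c 'pgrep -af "[s]leep"'   # matches only the real sleep process
kill %1
```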
````diff
@@ -125,7 +127,7 @@ ensure_installed() {
 }

 run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
+  # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases

   local serving_test_file
````
````diff
@@ -225,7 +227,7 @@ run_serving_tests() {

     if [[ "$dataset_name" = "sharegpt" ]]; then

-      client_command="python3 benchmark_serving.py \
+      client_command="vllm bench serve \
         --backend $backend \
         --tokenizer /tokenizer_cache \
         --model $model \
````
````diff
@@ -246,7 +248,7 @@ run_serving_tests() {
       sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
       sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

-      client_command="python3 benchmark_serving.py \
+      client_command="vllm bench serve \
         --backend $backend \
         --tokenizer /tokenizer_cache \
         --model $model \
````
````diff
@@ -265,13 +267,13 @@ run_serving_tests() {
         $client_args"

     else

       echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
       exit 1

     fi


     echo "Running test case $test_name with qps $qps"
     echo "Client command: $client_command"
````
````diff
@@ -302,7 +304,7 @@ run_serving_tests() {
 }

 run_genai_perf_tests() {
   # run genai-perf tests

   # $1: a json file specifying genai-perf test cases
   local genai_perf_test_file
````
````diff
@@ -311,14 +313,14 @@ run_genai_perf_tests() {
   # Iterate over genai-perf tests
   jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')

     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
       continue
     fi

     # prepend the current serving engine to the test name
     test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

````
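As the regex check above suggests, `TEST_SELECTOR` restricts a run to matching test names; for example (the selector value is illustrative):

```bash
# Only run test cases whose test_name matches the extended regex "sharegpt".
TEST_SELECTOR='sharegpt' bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```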
````diff
@@ -369,10 +371,10 @@ run_genai_perf_tests() {
       qps=$num_prompts
       echo "now qps is $qps"
     fi

     new_test_name=$test_name"_qps_"$qps
     backend=$CURRENT_LLM_SERVING_ENGINE

     if [[ "$backend" == *"vllm"* ]]; then
       backend="vllm"
     fi
````
````diff
@@ -413,7 +415,7 @@ prepare_dataset() {
   do
     cat sonnet.txt >> sonnet_4x.txt
   done

 }

 main() {
````
````diff
@@ -33,7 +33,7 @@ check_gpus() {

 check_cpus() {
   # check the number of CPUs and NUMA Node and GPU type.
-  declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
+  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
   if [[ $numa_count -gt 0 ]]; then
     echo "NUMA found."
     echo $numa_count
````
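For context on the `awk '{print $3}'` above, the matched `lscpu` line has the node count as its third whitespace-separated field (output illustrative):

```bash
lscpu | grep "NUMA node(s):"
# NUMA node(s):        2
lscpu | grep "NUMA node(s):" | awk '{print $3}'
# 2
```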
````diff
@@ -126,7 +126,8 @@ kill_gpu_processes() {
   ps -aux
   lsof -t -i:8000 | xargs -r kill -9
   pgrep python3 | xargs -r kill -9
+  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+  pgrep VLLM | xargs -r kill -9

   # wait until GPU memory usage smaller than 1GB
   if command -v nvidia-smi; then
````
````diff
@@ -164,7 +165,7 @@ upload_to_buildkite() {
 }

 run_latency_tests() {
-  # run latency tests using `benchmark_latency.py`
+  # run latency tests using `vllm bench latency` command
   # $1: a json file specifying latency test cases

   local latency_test_file
````
````diff
@@ -205,7 +206,7 @@ run_latency_tests() {
       fi
     fi

-    latency_command=" $latency_envs python3 benchmark_latency.py \
+    latency_command=" $latency_envs vllm bench latency \
       --output-json $RESULTS_FOLDER/${test_name}.json \
       $latency_args"

````
````diff
@@ -231,7 +232,7 @@ run_latency_tests() {
 }

 run_throughput_tests() {
-  # run throughput tests using `benchmark_throughput.py`
+  # run throughput tests using `vllm bench throughput`
   # $1: a json file specifying throughput test cases

   local throughput_test_file
````
````diff
@@ -272,7 +273,7 @@ run_throughput_tests() {
       fi
     fi

-    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+    throughput_command=" $throughput_envs vllm bench throughput \
       --output-json $RESULTS_FOLDER/${test_name}.json \
       $throughput_args"

````
````diff
@@ -297,7 +298,7 @@ run_throughput_tests() {
 }

 run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
+  # run serving tests using `vllm bench serve` command
   # $1: a json file specifying serving test cases

   local serving_test_file
````
````diff
@@ -393,7 +394,7 @@ run_serving_tests() {

       # pass the tensor parallel size to the client so that it can be displayed
       # on the benchmark dashboard
-      client_command="python3 benchmark_serving.py \
+      client_command="vllm bench serve \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
````
````diff
@@ -447,7 +448,7 @@ main() {
   (which jq) || (apt-get update && apt-get -y install jq)
   (which lsof) || (apt-get update && apt-get install -y lsof)

-  # get the current IP address, required by benchmark_serving.py
+  # get the current IP address, required by `vllm bench serve` command
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn off the reporting of the status of each request, to clean up the terminal output
   export VLLM_LOGGING_LEVEL="WARNING"
````
````diff
@@ -11,7 +11,6 @@
     },
     "vllm_server_parameters": {
       "disable_log_stats": "",
-      "disable_log_requests": "",
       "gpu_memory_utilization": 0.9,
       "num_scheduler_steps": 10,
       "max_num_seqs": 512,
@@ -35,7 +35,6 @@
     },
     "vllm_server_parameters": {
       "disable_log_stats": "",
-      "disable_log_requests": "",
       "gpu_memory_utilization": 0.9,
       "num_scheduler_steps": 10,
       "max_num_seqs": 512,
@@ -90,7 +89,6 @@
     },
     "vllm_server_parameters": {
       "disable_log_stats": "",
-      "disable_log_requests": "",
       "gpu_memory_utilization": 0.9,
       "num_scheduler_steps": 10,
       "max_num_seqs": 512,
@@ -145,7 +143,6 @@
     },
     "vllm_server_parameters": {
       "disable_log_stats": "",
-      "disable_log_requests": "",
       "gpu_memory_utilization": 0.9,
       "num_scheduler_steps": 10,
       "max_num_seqs": 512,
@@ -197,7 +194,6 @@
     },
     "vllm_server_parameters": {
       "disable_log_stats": "",
-      "disable_log_requests": "",
       "gpu_memory_utilization": 0.9,
       "num_scheduler_steps": 10,
       "max_num_seqs": 512,
@@ -251,7 +247,6 @@
     },
     "vllm_server_parameters": {
       "disable_log_stats": "",
-      "disable_log_requests": "",
       "gpu_memory_utilization": 0.9,
       "num_scheduler_steps": 10,
       "max_num_seqs": 512,
@@ -305,7 +300,6 @@
     },
     "vllm_server_parameters": {
       "disable_log_stats": "",
-      "disable_log_requests": "",
       "gpu_memory_utilization": 0.9,
       "num_scheduler_steps": 10,
       "max_num_seqs": 512,
````

New file `.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json` (203 lines):

```json
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "max_concurrency": 60, "num_prompts": 200 }
  },
  {
    "test_name": "serving_llama8B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "max_concurrency": 60, "num_prompts": 200 }
  },
  {
    "test_name": "serving_llama8B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "max_concurrency": 60, "num_prompts": 200 }
  },
  {
    "test_name": "serving_llama8B_tp1_random_128_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "max_concurrency": 1000, "num_prompts": 1000 }
  },
  {
    "test_name": "serving_llama8B_tp2_random_128_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "max_concurrency": 1000, "num_prompts": 1000 }
  },
  {
    "test_name": "serving_llama8B_tp4_random_128_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "max_concurrency": 1000, "num_prompts": 1000 }
  }
]
```

New file `.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json` (205 lines):

```json
[
  {
    "test_name": "serving_llama8B_pp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "max_concurrency": 60, "num_prompts": 200 }
  },
  {
    "test_name": "serving_llama8B_pp3_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "max_concurrency": 60, "num_prompts": 200 }
  },
  {
    "test_name": "serving_llama8B_tp2pp6_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "max_concurrency": 60, "num_prompts": 200 }
  },
  {
    "test_name": "serving_llama8B_pp1_random_128_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "max_concurrency": 1000, "num_prompts": 1000 }
  },
  {
    "test_name": "serving_llama8B_pp3_random_128_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "max_concurrency": 1000, "num_prompts": 1000 }
  },
  {
    "test_name": "serving_llama8B_tp2pp3_random_128_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 },
    "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy" },
    "client_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "max_concurrency": 1000, "num_prompts": 1000 }
  }
]
```
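Presumably these SNC-specific files are selected through the `SERVING_JSON` override documented in the README; a sketch of such a run (the pairing with `ON_CPU` is an assumption, not something this diff states):

```bash
ON_CPU=1 SERVING_JSON=.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json \
  bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```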
@ -6,6 +6,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -16,8 +17,9 @@
|
|||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -36,6 +38,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -46,8 +49,9 @@
|
|||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -66,6 +70,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -76,8 +81,9 @@
|
|||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -96,6 +102,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -107,8 +114,9 @@
|
|||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"enable_chunked_prefill": "",
|
"enable_chunked_prefill": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@ -129,6 +137,7 @@
|
|||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
@ -140,8 +149,9 @@
|
|||||||
"trust_remote_code": "",
|
"trust_remote_code": "",
|
||||||
"enable_chunked_prefill": "",
|
"enable_chunked_prefill": "",
|
||||||
"disable_log_stats": "",
|
"disable_log_stats": "",
|
||||||
"disable_log_requests": "",
|
|
||||||
"enforce_eager": "",
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
"load_format": "dummy"
|
"load_format": "dummy"
|
||||||
},
|
},
|
||||||
"client_parameters": {
|
"client_parameters": {
|
||||||
@@ -7,7 +7,6 @@
 "tensor_parallel_size": 1,
 "swap_space": 16,
 "disable_log_stats": "",
-"disable_log_requests": "",
 "load_format": "dummy"
 },
 "client_parameters": {
@@ -26,7 +25,6 @@
 "tensor_parallel_size": 4,
 "swap_space": 16,
 "disable_log_stats": "",
-"disable_log_requests": "",
 "load_format": "dummy"
 },
 "client_parameters": {
@@ -45,7 +43,6 @@
 "tensor_parallel_size": 2,
 "swap_space": 16,
 "disable_log_stats": "",
-"disable_log_requests": "",
 "load_format": "dummy"
 },
 "client_parameters": {
@@ -60,8 +57,7 @@
 "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
 "qps_list": [2],
 "server_parameters": {
 "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
-"disable_log_requests": "",
 "tensor_parallel_size": 4,
 "swap_space": 16,
 "speculative_config": {
@@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
 export CMAKE_BUILD_PARALLEL_LEVEL=32

 # Setup cleanup
 remove_docker_container() {
 set -e;
 docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
@@ -69,7 +69,7 @@ function cpu_tests() {
 docker exec cpu-test-"$NUMA_NODE" bash -c "
 set -e
 pytest -s -v \
 tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

 # Note: disable it until supports V1
 # Run AWQ test
@@ -78,23 +78,23 @@ function cpu_tests() {
 # VLLM_USE_V1=0 pytest -s -v \
 # tests/quantization/test_ipex_quant.py"

-# online serving
-docker exec cpu-test-"$NUMA_NODE" bash -c '
-set -e
-VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-python3 benchmarks/benchmark_serving.py \
---backend vllm \
---dataset-name random \
---model meta-llama/Llama-3.2-3B-Instruct \
---num-prompts 20 \
---endpoint /v1/completions'

 # Run multi-lora tests
 docker exec cpu-test-"$NUMA_NODE" bash -c "
 set -e
 pytest -s -v \
 tests/lora/test_qwen2vl.py"

+# online serving
+docker exec cpu-test-"$NUMA_NODE" bash -c '
+set -e
+VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+--backend vllm \
+--dataset-name random \
+--model meta-llama/Llama-3.2-3B-Instruct \
+--num-prompts 20 \
+--endpoint /v1/completions'
 }

 # All of CPU tests are expected to be finished less than 40 mins.
@@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \
 --build-arg max_jobs=66 \
 --build-arg nvcc_threads=2 \
 --build-arg RUN_WHEEL_CHECK=false \
---build-arg torch_cuda_arch_list="9.0+PTX" \
---build-arg vllm_fa_cmake_gpu_arches="90-real"
+--build-arg torch_cuda_arch_list="9.0+PTX"

 # Setup cleanup
 remove_docker_container() { docker rm -f gh200-test || true; }
.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh (new executable file, 165 lines)
@@ -0,0 +1,165 @@
+#!/bin/bash
+
+set -xu
+
+
+remove_docker_container() {
+docker rm -f tpu-test || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+cleanup_docker() {
+# Get Docker's root directory
+docker_root=$(docker info -f '{{.DockerRootDir}}')
+if [ -z "$docker_root" ]; then
+echo "Failed to determine Docker root directory."
+exit 1
+fi
+echo "Docker root directory: $docker_root"
+# Check disk usage of the filesystem where Docker's root directory is located
+disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+# Define the threshold
+threshold=70
+if [ "$disk_usage" -gt "$threshold" ]; then
+echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+# Remove dangling images (those that are not tagged and not used by any container)
+docker image prune -f
+# Remove unused volumes / force the system prune for old images as well.
+docker volume prune -f && docker system prune --force --filter "until=72h" --all
+echo "Docker images and volumes cleanup completed."
+else
+echo "Disk usage is below $threshold%. No cleanup needed."
+fi
+}
+cleanup_docker
+
+# For HF_TOKEN.
+source /etc/environment
+
+docker run --privileged --net host --shm-size=16G -it \
+-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+vllm-tpu /bin/bash -c '
+set -e # Exit immediately if a command exits with a non-zero status.
+set -u # Treat unset variables as an error.
+
+echo "--- Starting script inside Docker container ---"
+
+# Create results directory
+RESULTS_DIR=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $RESULTS_DIR"
+
+# Install dependencies
+echo "--- Installing Python dependencies ---"
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
+&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
+&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+&& python3 -m pip install --progress-bar off hf-transfer
+echo "--- Python dependencies installed ---"
+export VLLM_USE_V1=1
+export VLLM_XLA_CHECK_RECOMPILATION=1
+export VLLM_XLA_CACHE_PATH=
+echo "Using VLLM V1"
+
+echo "--- Hardware Information ---"
+# tpu-info
+echo "--- Starting Tests ---"
+set +e
+overall_script_exit_code=0
+
+# --- Test Definitions ---
+# If a test fails, this function will print logs and will not cause the main script to exit.
+run_test() {
+local test_num=$1
+local test_name=$2
+local test_command=$3
+local log_file="$RESULTS_DIR/test_${test_num}.log"
+local actual_exit_code
+
+echo "--- TEST_$test_num: Running $test_name ---"
+
+# Execute the test command.
+eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
+actual_exit_code=$?
+
+echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
+echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
+
+if [ "$actual_exit_code" -ne 0 ]; then
+echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
+echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
+if [ -f "$log_file" ]; then
+cat "$log_file" >&2
+else
+echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
+fi
+echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
+return "$actual_exit_code" # Return the failure code
+else
+echo "TEST_$test_num ($test_name) PASSED."
+return 0 # Return success
+fi
+}
+
+# Helper function to call run_test and update the overall script exit code
+run_and_track_test() {
+local test_num_arg="$1"
+local test_name_arg="$2"
+local test_command_arg="$3"
+
+# Run the test
+run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
+local test_specific_exit_code=$?
+
+# If the test failed, set the overall script exit code to 1
+if [ "$test_specific_exit_code" -ne 0 ]; then
+# No need for extra echo here, run_test already logged the failure.
+overall_script_exit_code=1
+fi
+}
+
+# --- Actual Test Execution ---
+run_and_track_test 1 "test_struct_output_generate.py" \
+"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 2 "test_moe_pallas.py" \
+"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 3 "test_lora.py" \
+"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 4 "test_tpu_qkv_linear.py" \
+"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 5 "test_spmd_model_weight_loading.py" \
+"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 6 "test_kv_cache_update_kernel.py" \
+"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
+
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
+
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+exit "$DOCKER_RUN_EXIT_CODE"
+else
+echo "Docker run command completed successfully."
+exit 0
+fi
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@@ -5,7 +5,6 @@ set -xu

 remove_docker_container() {
 docker rm -f tpu-test || true;
-docker rm -f vllm-tpu || true;
 }

 trap remove_docker_container EXIT
@@ -62,7 +61,8 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
 && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
+&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+&& python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
@@ -134,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \
 run_and_track_test 2 "test_basic.py" \
 "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
 run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
-"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
+"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
 run_and_track_test 4 "test_quantization_accuracy.py" \
 "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
 run_and_track_test 5 "examples/offline_inference/tpu.py" \
@@ -149,18 +149,6 @@ run_and_track_test 9 "test_multimodal.py" \
 "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 run_and_track_test 10 "test_pallas.py" \
 "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
-run_and_track_test 11 "test_struct_output_generate.py" \
-"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-run_and_track_test 12 "test_moe_pallas.py" \
-"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-run_and_track_test 13 "test_lora.py" \
-"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 14 "test_tpu_qkv_linear.py" \
-"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 15 "test_spmd_model_weight_loading.py" \
-"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-run_and_track_test 16 "test_kv_cache_update_kernel.py" \
-"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"

 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
@@ -31,4 +31,13 @@ docker run \
 VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
 cd tests
 pytest -v -s v1/core
+pytest -v -s v1/engine
+pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
+pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
+pytest -v -s v1/structured_output
+pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
+pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
+pytest -v -s v1/test_serial_utils.py
+pytest -v -s v1/test_utils.py
+pytest -v -s v1/test_metrics_reader.py
 '
@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?

-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r

 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name sharegpt \
 --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
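Across these CI scripts the standalone scripts under benchmarks/ are replaced by the vllm bench CLI. As a rough sketch of the migrated invocations, with flags and the dataset path taken from the hunks above, the model name purely illustrative, and a server assumed to already be listening on localhost:8000:

# offline benchmarks (same flags as in the script above)
vllm bench latency --output-json latency_results.json
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json
# online serving benchmark against the already-running server
vllm bench serve \
  --backend vllm \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --model meta-llama/Llama-3.2-3B-Instruct  # illustrative; the scripts pass their own model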
@@ -1,6 +1,6 @@
 # Environment config
 TEST_NAME=llama8b
-CONTAINER_NAME=vllm-tpu
+CONTAINER_NAME=tpu-test

 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
@@ -12,8 +12,6 @@ source /etc/environment
 source $ENV_FILE

 remove_docker_container() {
-docker rm -f tpu-test || true;
-docker rm -f vllm-tpu || true;
 docker rm -f $CONTAINER_NAME || true;
 }

@@ -1,6 +1,6 @@
 # Environment config
 TEST_NAME=llama8bw8a8
-CONTAINER_NAME=vllm-tpu
+CONTAINER_NAME=tpu-test

 # vllm config
 MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
@@ -44,7 +44,6 @@ echo

 VLLM_USE_V1=1 vllm serve $MODEL \
 --seed 42 \
---disable-log-requests \
 --max-num-seqs $MAX_NUM_SEQS \
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
 --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
@@ -77,7 +76,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model $MODEL \
 --dataset-name sonnet \
@@ -82,7 +82,7 @@ steps:
 - bash standalone_tests/python_only_compile.sh

 - label: Basic Correctness Test # 30min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 fast_check: true
 torch_nightly: true
 source_file_dependencies:
@@ -99,7 +99,7 @@ steps:
 - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Chunked Prefill Test
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/basic_correctness/test_chunked_prefill
@@ -108,7 +108,7 @@ steps:
 - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py

 - label: Core Test # 10min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 fast_check: true
 source_file_dependencies:
 - vllm/core
@@ -128,11 +128,10 @@ steps:
 - tests/entrypoints/offline_mode
 commands:
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
 - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
 - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Test (API Server) # 40min
@@ -166,6 +165,7 @@ steps:
 - tests/v1/test_async_llm_dp.py
 - tests/v1/test_external_lb_dp.py
 - tests/v1/test_internal_lb_dp.py
+- tests/v1/test_hybrid_lb_dp.py
 - tests/v1/engine/test_engine_core_client.py
 commands:
 # test with tp=2 and external_dp=2
@@ -178,6 +178,7 @@ steps:
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
 - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
+- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
 - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
 - pytest -v -s distributed/test_utils.py
 - pytest -v -s compile/test_basic_correctness.py
@@ -208,7 +209,7 @@ steps:
 - pytest -v -s distributed/test_eplb_execute.py

 - label: Metrics, Tracing Test # 10min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 num_gpus: 2
 source_file_dependencies:
 - vllm/
@@ -227,7 +228,7 @@ steps:
 ##### 1 GPU test #####

 - label: Regression Test # 5min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/test_regression
@@ -279,7 +280,7 @@ steps:
 - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

 - label: Examples Test # 25min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 working_dir: "/vllm-workspace/examples"
 source_file_dependencies:
 - vllm/entrypoints
@@ -304,7 +305,7 @@ steps:
 - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Prefix Caching Test # 9min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/prefix_caching
@@ -313,7 +314,7 @@ steps:


 - label: Platform Tests (CUDA)
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/cuda
@@ -352,9 +353,10 @@ steps:
 - pytest -v -s compile/test_silu_mul_quant_fusion.py
 - pytest -v -s compile/test_sequence_parallelism.py
 - pytest -v -s compile/test_async_tp.py
+- pytest -v -s compile/test_fusion_all_reduce.py

 - label: PyTorch Fullgraph Smoke Test # 9min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 torch_nightly: true
 source_file_dependencies:
 - vllm/
@@ -367,7 +369,7 @@ steps:
 - pytest -v -s compile/piecewise/test_full_cudagraph.py

 - label: PyTorch Fullgraph Test # 18min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 torch_nightly: true
 source_file_dependencies:
 - vllm/
@@ -376,7 +378,7 @@ steps:
 - pytest -v -s compile/test_full_graph.py

 - label: Kernels Core Operation Test
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - csrc/
 - tests/kernels/core
@@ -401,20 +403,21 @@ steps:
 - vllm/model_executor/layers/quantization
 - tests/kernels/quantization
 commands:
 - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
 parallelism: 2

-- label: Kernels MoE Test
+- label: Kernels MoE Test %N
 mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - csrc/moe/
 - tests/kernels/moe
 - vllm/model_executor/layers/fused_moe/
 commands:
-- pytest -v -s kernels/moe
+- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+parallelism: 2

 - label: Kernels Mamba Test
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - csrc/mamba/
 - tests/kernels/mamba
@@ -422,7 +425,7 @@ steps:
 - pytest -v -s kernels/mamba

 - label: Tensorizer Test # 11min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 soft_fail: true
 source_file_dependencies:
 - vllm/model_executor/model_loader
@@ -435,7 +438,7 @@ steps:
 - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

 - label: Model Executor Test
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/model_executor
 - tests/model_executor
@@ -445,7 +448,7 @@ steps:
 - pytest -v -s model_executor

 - label: Benchmarks # 9min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 working_dir: "/vllm-workspace/.buildkite"
 source_file_dependencies:
 - benchmarks/
@@ -453,7 +456,7 @@ steps:
 - bash scripts/run-benchmarks.sh

 - label: Benchmarks CLI Test # 10min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/benchmarks/
@@ -492,7 +495,7 @@ steps:
 - pytest -s entrypoints/openai/correctness/

 - label: Encoder Decoder tests # 5min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 source_file_dependencies:
 - vllm/
 - tests/encoder_decoder
@@ -500,7 +503,7 @@ steps:
 - pytest -v -s encoder_decoder

 - label: OpenAI-Compatible Tool Use # 20 min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 fast_check: false
 source_file_dependencies:
 - vllm/
@@ -578,7 +581,8 @@ steps:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
 - pip freeze | grep -E 'torch'
 - pytest -v -s models/multimodal/processing
-- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
+- pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model
+- pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn"
 - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Models Test (Extended) 1
@@ -621,7 +625,7 @@ steps:

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 optional: true
 commands:
 - echo 'Testing custom models...'
@@ -641,11 +645,40 @@ steps:
 - python3 examples/offline_inference/audio_language.py --model-type whisper
 - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

+- label: Blackwell Test
+working_dir: "/vllm-workspace/"
+gpu: b200
+# optional: true
+source_file_dependencies:
+- csrc/quantization/fp4/
+- csrc/attention/mla/
+- csrc/quantization/cutlass_w8a8/moe/
+- vllm/model_executor/layers/fused_moe/cutlass_moe.py
+- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+- vllm/v1/attention/backends/flashinfer.py
+- vllm/compilation/fusion.py
+commands:
+- nvidia-smi
+- python3 examples/offline_inference/basic/chat.py
+# Attention
+# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+- pytest -v -s tests/kernels/test_cutlass_mla_decode.py
+# Quantization
+- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+# Fusion
+- pytest -v -s tests/compile/test_fusion_all_reduce.py

 ##### 1 GPU test #####
 ##### multi gpus test #####

 - label: Distributed Comm Ops Test # 7min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 working_dir: "/vllm-workspace/tests"
 num_gpus: 2
 source_file_dependencies:
@@ -716,8 +749,8 @@ steps:
 # this test fails consistently.
 # TODO: investigate and fix
 - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+- pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
 mirror_hardwares: [amdexperimental]
@@ -741,7 +774,7 @@ steps:
 - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

 - label: Multi-step Tests (4 GPUs) # 36min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 working_dir: "/vllm-workspace/tests"
 num_gpus: 4
 source_file_dependencies:
@@ -762,7 +795,7 @@ steps:
 - pytest -v -s multi_step/test_correctness_llm.py

 - label: Pipeline Parallelism Test # 45min
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 working_dir: "/vllm-workspace/tests"
 num_gpus: 4
 source_file_dependencies:
@@ -776,7 +809,7 @@ steps:
 - pytest -v -s distributed/test_pipeline_parallel.py

 - label: LoRA TP Test (Distributed)
-mirror_hardwares: [amdexperimental, amdproduction]
+mirror_hardwares: [amdexperimental]
 num_gpus: 4
 source_file_dependencies:
 - vllm/lora
@@ -789,6 +822,7 @@ steps:
 # requires multi-GPU testing for validation.
 - pytest -v -s -x lora/test_chatglm3_tp.py
 - pytest -v -s -x lora/test_llama_tp.py
+- pytest -v -s -x lora/test_multi_loras_with_tp.py


 - label: Weight Loading Multiple GPU Test # 33min
.github/CODEOWNERS (27 changed lines)
@@ -10,7 +10,6 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@@ -35,9 +34,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
 /tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
 /tests/multi_step @alexm-redhat @comaniac
 /tests/multimodal @DarkLight1337 @ywang96
@@ -52,3 +49,27 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor
+
+# CPU
+/vllm/v1/worker/^cpu @bigPYJ1151
+/csrc/cpu @bigPYJ1151
+/vllm/platforms/cpu.py @bigPYJ1151
+/cmake/cpu_extension.cmake @bigPYJ1151
+/docker/Dockerfile.cpu @bigPYJ1151
+
+# Intel GPU
+/vllm/v1/worker/^xpu @jikunshang
+/vllm/platforms/xpu.py @jikunshang
+/docker/Dockerfile.xpu @jikunshang
+
+# Qwen-specific files
+/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
+/vllm/model_executor/models/qwen* @sighingnow
+
+# Mistral-specific files
+/vllm/model_executor/models/mistral*.py @patrickvonplaten
+/vllm/model_executor/models/mixtral*.py @patrickvonplaten
+/vllm/model_executor/models/voxtral*.py @patrickvonplaten
+/vllm/model_executor/models/pixtral*.py @patrickvonplaten
+/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
+/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
.github/PULL_REQUEST_TEMPLATE.md (4 changed lines)
@@ -1,4 +1,5 @@
-## Essential Elements of an Effective PR Description Checklist
+# Essential Elements of an Effective PR Description Checklist
+
 - [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
 - [ ] The test plan, such as providing test command.
 - [ ] The test results, such as pasting the results comparison before and after, or e2e results
@@ -14,5 +15,4 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B

 ## (Optional) Documentation Update

-<!--- pyml disable-next-line no-emphasis-as-heading -->
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
.github/mergify.yml (3 changed lines)
@@ -149,9 +149,6 @@ pull_request_rules:
 - files=examples/offline_inference/structured_outputs.py
 - files=examples/online_serving/openai_chat_completion_structured_outputs.py
 - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
-- files~=^vllm/model_executor/guided_decoding/
-- files=tests/model_executor/test_guided_processors.py
-- files=tests/entrypoints/llm/test_guided_generate.py
 - files~=^tests/v1/structured_output/
 - files=tests/v1/entrypoints/llm/test_guided_generate.py
 - files~=^vllm/v1/structured_output/
.github/workflows/lint-and-deploy.yaml (4 changed lines)
@@ -2,6 +2,10 @@ name: Lint and Deploy Charts

 on: pull_request

+concurrency:
+group: ${{ github.workflow }}-${{ github.ref }}
+cancel-in-progress: true
+
 permissions:
 contents: read

.github/workflows/matchers/markdownlint.json (new file, 17 lines)
@@ -0,0 +1,17 @@
+{
+"problemMatcher": [
+{
+"owner": "markdownlint",
+"pattern": [
+{
+"regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
+"file": 1,
+"line": 2,
+"column": 3,
+"code": 4,
+"message": 5
+}
+]
+}
+]
+}
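For illustration only: markdownlint-cli reports diagnostics in a "file:line[:column] rule description" shape, which is what the regexp above splits into the file, line, column, code, and message capture groups so CI can annotate the offending lines. A sketch with a hypothetical file and rule hit:

# hypothetical run; the output line shows the format the matcher expects
markdownlint docs/example.md
# docs/example.md:12:3 MD007/ul-indent Unordered list indentation [Expected: 4; Actual: 2]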
.github/workflows/pre-commit.yml (5 changed lines)
@@ -5,6 +5,10 @@ on:
 push:
 branches: [main]

+concurrency:
+group: ${{ github.workflow }}-${{ github.ref }}
+cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 permissions:
 contents: read

@@ -17,6 +21,7 @@ jobs:
 with:
 python-version: "3.12"
 - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+- run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
 - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
 - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
 with:
.github/workflows/scripts/build.sh (1 changed line)
@@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"

 bash tools/check_repo.sh

.markdownlint.yaml (new file, 13 lines)
@@ -0,0 +1,13 @@
+MD007:
+indent: 4
+MD013: false
+MD024:
+siblings_only: true
+MD033: false
+MD042: false
+MD045: false
+MD046: false
+MD051: false
+MD052: false
+MD053: false
+MD059: false
@@ -35,12 +35,12 @@ repos:
 exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
 types_or: [c++, cuda]
 args: [--style=file, --verbose]
-- repo: https://github.com/jackdewinter/pymarkdown
-rev: v0.9.29
+- repo: https://github.com/igorshubovych/markdownlint-cli
+rev: v0.45.0
 hooks:
-- id: pymarkdown
+- id: markdownlint
 exclude: '.*\.inc\.md'
-args: [fix]
+stages: [manual] # Only run in CI
 - repo: https://github.com/rhysd/actionlint
 rev: v1.7.7
 hooks:
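Because the new markdownlint hook is registered with stages: [manual], it is skipped on ordinary local commits and only runs when invoked explicitly, as CI does. A sketch of an equivalent local invocation using standard pre-commit options:

pre-commit run markdownlint --all-files --hook-stage manual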
@@ -7,6 +7,9 @@ build:
 os: ubuntu-22.04
 tools:
 python: "3.12"
+jobs:
+post_checkout:
+- git fetch --unshallow || true

 mkdocs:
 configuration: mkdocs.yaml
@@ -529,6 +529,25 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 endif()
+
+# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
+# CUDA 12.8 or later
+cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
+set(SRCS
+"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+set_gencode_flags_for_srcs(
+SRCS "${SRCS}"
+CUDA_ARCHS "${FP4_ARCHS}")
+list(APPEND VLLM_EXT_SRC "${SRCS}")
+list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+else()
+message(STATUS "Not building NVFP4 as no compatible archs were found.")
+# clear FP4_ARCHS
+set(FP4_ARCHS)
+endif()
+
 # FP4 Archs and flags
 cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
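Rough sketch, not part of the diff: the SM120 block above only compiles when the toolkit is CUDA 12.8 or newer and the configured arch list intersects "12.0;12.0a", so a source build targeting GeForce Blackwell would pin the arch list accordingly (the exact build workflow here is an assumption; only the arch requirement is implied by the CMake guard):

# hypothetical from-source build with the arch list restricted to SM120
export TORCH_CUDA_ARCH_LIST="12.0"
pip install -e .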
@@ -541,7 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 SRCS "${SRCS}"
 CUDA_ARCHS "${FP4_ARCHS}")
 list(APPEND VLLM_EXT_SRC "${SRCS}")
-list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1")
 list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
 message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
 else()
@@ -635,7 +654,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 "in CUDA target architectures.")
 endif()
 endif()

 cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
@@ -768,6 +787,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
 endif()
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+set(MOE_PERMUTE_SRC
+"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+"csrc/moe/moe_permute_unpermute_op.cu")
+
+list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+endif()
+
 set_gencode_flags_for_srcs(
 SRCS "${VLLM_MOE_EXT_SRC}"
 CUDA_ARCHS "${CUDA_ARCHS}")
@@ -836,17 +863,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 endif()

-if(VLLM_GPU_LANG STREQUAL "CUDA")
-set(MOE_PERMUTE_SRC
-"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
-"csrc/moe/moe_permute_unpermute_op.cu")
-
-set_gencode_flags_for_srcs(
-SRCS "${MARLIN_PERMUTE_SRC}"
-CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
-
-list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
-endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
 _moe_C
|
|||||||
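The new SM120 block above only compiles the `nvfp4_scaled_mm_sm120` sources when two conditions hold: the CUDA compiler is 12.8 or newer and a 12.0/12.0a arch survives the intersection with `CUDA_ARCHS`. A hedged way to check the local toolkit before expecting `-DENABLE_NVFP4_SM120=1` to be defined (the version string shown is illustrative):

```bash
# Print the local CUDA compiler version; the SM120 FP4 path needs >= 12.8.
nvcc --version | grep -i "release"
# e.g. "Cuda compilation tools, release 12.8, V12.8.61"
```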
@@ -1,3 +1,4 @@
+<!-- markdownlint-disable MD001 MD041 -->
 <p align="center">
   <picture>
     <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">

@@ -16,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone
 ---
 
 *Latest News* 🔥
+
 - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).

@@ -46,6 +48,7 @@ Easy, fast, and cheap LLM serving for everyone
 </details>
 
 ---
 
 ## About
+
 vLLM is a fast and easy-to-use library for LLM inference and serving.

@@ -75,6 +78,7 @@ vLLM is flexible and easy to use with:
 - Multi-LoRA support
 
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
 - Embedding Models (e.g., E5-Mistral)

@@ -91,6 +95,7 @@ pip install vllm
 ```
 
 Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
+
 - [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
 - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
 - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

@@ -107,6 +112,7 @@ vLLM is a community project. Our compute resources for development and testing a
 <!-- Note: Please sort them in alphabetical order. -->
 <!-- Note: Please keep these consistent with docs/community/sponsors.md -->
 Cash Donations:
+
 - a16z
 - Dropbox
 - Sequoia Capital

@@ -114,6 +120,7 @@ Cash Donations:
 - ZhenFund
 
 Compute Resources:
+
 - AMD
 - Anyscale
 - AWS
@@ -60,9 +60,10 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons
 Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
 
 **Current Coverage:**
 
 * Models: Llama3, Llama4, and Mixtral
 * Hardware: NVIDIA H100 and AMD MI300x
-* *Note: Coverage may change based on new model releases and hardware availability*
+* _Note: Coverage may change based on new model releases and hardware availability_
 
 **Performance Validation Process:**

@@ -71,11 +72,13 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github
 
 **Step 2: Review Benchmark Setup**
 Familiarize yourself with the benchmark configurations:
+
 * [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
 * [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
 
 **Step 3: Run the Benchmark**
 Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
+
 * **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
 * **vLLM commit**: Set to the RC commit hash
 
SECURITY.md (40 changed lines)

@@ -1,13 +1,45 @@
 # Security Policy
 
-## Reporting a Vulnerability
+## Reporting security issues
 
-If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
 
-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
+## Issue triage
 
----
+Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
 
+## Threat model
+
 Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
 
 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
+
+## Issue severity
+
+We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
+
+### CRITICAL Severity
+
+Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0.
+
+### HIGH Severity
+
+Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9
+
+### MODERATE Severity
+
+Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9
+
+### LOW Severity
+
+Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0
+
+## Prenotification policy
+
+For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues.
+
+* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
+
+* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
+
+* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
@@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
 datasets supported on vLLM. It’s a living document, updated as new features and datasets
 become available.
 
-**Dataset Overview**
+## Dataset Overview
 
 <table style="width:100%; border-collapse: collapse;">
 <thead>

@@ -81,16 +81,17 @@ become available.
 
 **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
 
----
+## 🚀 Example - Online Benchmark
 
 <details>
-<summary><b>🚀 Example - Online Benchmark</b></summary>
+<summary>Show more</summary>
 
 <br/>
 
 First start serving your model
 
 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
 ```
 
 Then run the benchmarking script

@@ -98,7 +99,7 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --endpoint /v1/completions \
@@ -109,48 +110,48 @@ python3 vllm/benchmarks/benchmark_serving.py \
 
 If successful, you will see the following output
 
-```
+```text
 ============ Serving Benchmark Result ============
 Successful requests: 10
 Benchmark duration (s): 5.78
 Total input tokens: 1369
 Total generated tokens: 2212
 Request throughput (req/s): 1.73
 Output token throughput (tok/s): 382.89
 Total Token throughput (tok/s): 619.85
 ---------------Time to First Token----------------
 Mean TTFT (ms): 71.54
 Median TTFT (ms): 73.88
 P99 TTFT (ms): 79.49
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms): 7.91
 Median TPOT (ms): 7.96
 P99 TPOT (ms): 8.03
 ---------------Inter-token Latency----------------
 Mean ITL (ms): 7.74
 Median ITL (ms): 7.70
 P99 ITL (ms): 8.39
 ==================================================
 ```
 
-**Custom Dataset**
+### Custom Dataset
 
 If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
 
-```
+```json
 {"prompt": "What is the capital of India?"}
 {"prompt": "What is the capital of Iran?"}
 {"prompt": "What is the capital of China?"}
 ```
 
 ```bash
 # start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
 ```
 
 ```bash
 # run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
   --backend vllm \
   --model meta-llama/Llama-3.1-8B-Instruct \
   --endpoint /v1/completions \
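As a rough cross-check of the sample output in the hunk above, the throughput figures are simply the totals divided by the benchmark duration. A small sketch using the example's own numbers (the small differences from the printed values come from rounding the duration):

```bash
# req/s, output tok/s, and total tok/s recomputed from the sample run above
awk 'BEGIN {
  d = 5.78                  # benchmark duration (s)
  print 10 / d              # ~1.73  request throughput (req/s)
  print 2212 / d            # ~382.7 output token throughput (tok/s)
  print (1369 + 2212) / d   # ~619.6 total token throughput (tok/s)
}'
```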
@@ -166,15 +167,15 @@ python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detaile
 
 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
 
-**VisionArena Benchmark for Vision Language Models**
+### VisionArena Benchmark for Vision Language Models
 
 ```bash
 # need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \

@@ -184,7 +185,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 1000
 ```
 
-**InstructCoder Benchmark with Speculative Decoding**
+### InstructCoder Benchmark with Speculative Decoding
 
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \

@@ -194,23 +195,23 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 ```
 
 ``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
   --model meta-llama/Meta-Llama-3-8B-Instruct \
   --dataset-name hf \
   --dataset-path likaixin/InstructCoder \
   --num-prompts 2048
 ```
 
-**Other HuggingFaceDataset Examples**
+### Other HuggingFaceDataset Examples
 
 ```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```
 
-**`lmms-lab/LLaVA-OneVision-Data`**
+`lmms-lab/LLaVA-OneVision-Data`:
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
@@ -221,10 +222,10 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 10
 ```
 
-**`Aeala/ShareGPT_Vicuna_unfiltered`**
+`Aeala/ShareGPT_Vicuna_unfiltered`:
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \

@@ -234,10 +235,10 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 10
 ```
 
-**`AI-MO/aimo-validation-aime`**
+`AI-MO/aimo-validation-aime`:
 
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --model Qwen/QwQ-32B \
   --dataset-name hf \
   --dataset-path AI-MO/aimo-validation-aime \

@@ -245,23 +246,23 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --seed 42
 ```
 
-**`philschmid/mt-bench`**
+`philschmid/mt-bench`:
 
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --model Qwen/QwQ-32B \
   --dataset-name hf \
   --dataset-path philschmid/mt-bench \
   --num-prompts 80
 ```
 
-**Running With Sampling Parameters**
+### Running With Sampling Parameters
 
 When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
   --backend vllm \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --endpoint /v1/completions \
@@ -273,30 +274,34 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 10
 ```
 
-**Running With Ramp-Up Request Rate**
+### Running With Ramp-Up Request Rate
 
 The benchmark tool also supports ramping up the request rate over the
 duration of the benchmark run. This can be useful for stress testing the
 server or finding the maximum throughput that it can handle, given some latency budget.
 
 Two ramp-up strategies are supported:
 
 - `linear`: Increases the request rate linearly from a start value to an end value.
 - `exponential`: Increases the request rate exponentially.
 
 The following arguments can be used to control the ramp-up:
 
 - `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
 - `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
 - `--ramp-up-end-rps`: The request rate at the end of the benchmark.
 
 </details>
 
+## 📈 Example - Offline Throughput Benchmark
+
 <details>
-<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
+<summary>Show more</summary>
 
 <br/>
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model NousResearch/Hermes-3-Llama-3.1-8B \
   --dataset-name sonnet \
   --dataset-path vllm/benchmarks/sonnet.txt \
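The ramp-up flags described in the hunk above plug into the same `vllm bench serve` invocation used throughout these examples. A hedged sketch (model, rates, and prompt count are illustrative, not taken from the diff):

```bash
# Linearly ramp the request rate from 1 to 20 req/s over the run
vllm bench serve \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name random \
  --ramp-up-strategy linear \
  --ramp-up-start-rps 1 \
  --ramp-up-end-rps 20 \
  --num-prompts 500
```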
@@ -305,16 +310,16 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 
 If successful, you will see the following output
 
-```
+```text
 Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
 Total num prompt tokens: 5014
 Total num output tokens: 1500
 ```
 
-**VisionArena Benchmark for Vision Language Models**
+### VisionArena Benchmark for Vision Language Models
 
-``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+```bash
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \

@@ -325,18 +330,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 
 The `num prompt tokens` now includes image token counts
 
-```
+```text
 Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
 Total num prompt tokens: 14527
 Total num output tokens: 1280
 ```
 
-**InstructCoder Benchmark with Speculative Decoding**
+### InstructCoder Benchmark with Speculative Decoding
 
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --dataset-name=hf \
   --dataset-path=likaixin/InstructCoder \
   --model=meta-llama/Meta-Llama-3-8B-Instruct \

@@ -349,18 +354,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \
   "prompt_lookup_min": 2}'
 ```
 
-```
+```text
 Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
 Total num prompt tokens: 261136
 Total num output tokens: 204800
 ```
 
-**Other HuggingFaceDataset Examples**
+### Other HuggingFaceDataset Examples
 
-**`lmms-lab/LLaVA-OneVision-Data`**
+`lmms-lab/LLaVA-OneVision-Data`:
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \

@@ -370,10 +375,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \
   --num-prompts 10
 ```
 
-**`Aeala/ShareGPT_Vicuna_unfiltered`**
+`Aeala/ShareGPT_Vicuna_unfiltered`:
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --backend vllm-chat \
   --dataset-name hf \

@@ -382,10 +387,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \
   --num-prompts 10
 ```
 
-**`AI-MO/aimo-validation-aime`**
+`AI-MO/aimo-validation-aime`:
 
 ```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model Qwen/QwQ-32B \
   --backend vllm \
   --dataset-name hf \

@@ -394,12 +399,12 @@ python3 benchmarks/benchmark_throughput.py \
   --num-prompts 10
 ```
 
-**Benchmark with LoRA Adapters**
+Benchmark with LoRA adapters:
 
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
   --model meta-llama/Llama-2-7b-hf \
   --backend vllm \
   --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
@@ -413,20 +418,22 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 
 </details>
 
+## 🛠️ Example - Structured Output Benchmark
+
 <details>
-<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
+<summary>Show more</summary>
 
 <br/>
 
 Benchmark the performance of structured output generation (JSON, grammar, regex).
 
-**Server Setup**
+### Server Setup
 
 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B
 ```
 
-**JSON Schema Benchmark**
+### JSON Schema Benchmark
 
 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \

@@ -438,7 +445,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
   --num-prompts 1000
 ```
 
-**Grammar-based Generation Benchmark**
+### Grammar-based Generation Benchmark
 
 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \

@@ -450,7 +457,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
   --num-prompts 1000
 ```
 
-**Regex-based Generation Benchmark**
+### Regex-based Generation Benchmark
 
 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \

@@ -461,7 +468,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
   --num-prompts 1000
 ```
 
-**Choice-based Generation Benchmark**
+### Choice-based Generation Benchmark
 
 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \

@@ -472,7 +479,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
   --num-prompts 1000
 ```
 
-**XGrammar Benchmark Dataset**
+### XGrammar Benchmark Dataset
 
 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \
@@ -485,14 +492,16 @@ python3 benchmarks/benchmark_serving_structured_output.py \
 
 </details>
 
+## 📚 Example - Long Document QA Benchmark
+
 <details>
-<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
+<summary>Show more</summary>
 
 <br/>
 
 Benchmark the performance of long document question-answering with prefix caching.
 
-**Basic Long Document QA Test**
+### Basic Long Document QA Test
 
 ```bash
 python3 benchmarks/benchmark_long_document_qa_throughput.py \

@@ -504,7 +513,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
   --repeat-count 5
 ```
 
-**Different Repeat Modes**
+### Different Repeat Modes
 
 ```bash
 # Random mode (default) - shuffle prompts randomly

@@ -537,14 +546,16 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
 
 </details>
 
+## 🗂️ Example - Prefix Caching Benchmark
+
 <details>
-<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
+<summary>Show more</summary>
 
 <br/>
 
 Benchmark the efficiency of automatic prefix caching.
 
-**Fixed Prompt with Prefix Caching**
+### Fixed Prompt with Prefix Caching
 
 ```bash
 python3 benchmarks/benchmark_prefix_caching.py \

@@ -555,7 +566,7 @@ python3 benchmarks/benchmark_prefix_caching.py \
   --input-length-range 128:256
 ```
 
-**ShareGPT Dataset with Prefix Caching**
+### ShareGPT Dataset with Prefix Caching
 
 ```bash
 # download dataset

@@ -572,14 +583,16 @@ python3 benchmarks/benchmark_prefix_caching.py \
 
 </details>
 
+## ⚡ Example - Request Prioritization Benchmark
+
 <details>
-<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
+<summary>Show more</summary>
 
 <br/>
 
 Benchmark the performance of request prioritization in vLLM.
 
-**Basic Prioritization Test**
+### Basic Prioritization Test
 
 ```bash
 python3 benchmarks/benchmark_prioritization.py \

@@ -590,7 +603,7 @@ python3 benchmarks/benchmark_prioritization.py \
   --scheduling-policy priority
 ```
 
-**Multiple Sequences per Prompt**
+### Multiple Sequences per Prompt
 
 ```bash
 python3 benchmarks/benchmark_prioritization.py \
@ -3,6 +3,7 @@
|
|||||||
This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
|
This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
|
||||||
|
|
||||||
## Table of Contents
|
## Table of Contents
|
||||||
|
|
||||||
- [Prerequisites](#prerequisites)
|
- [Prerequisites](#prerequisites)
|
||||||
- [Configuration](#configuration)
|
- [Configuration](#configuration)
|
||||||
- [How to Run](#how-to-run)
|
- [How to Run](#how-to-run)
|
||||||
@ -39,6 +40,7 @@ You must set the following variables at the top of the script before execution.
|
|||||||
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||||
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||||
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
|
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
|
||||||
|
| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
|
||||||
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
|
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
|
||||||
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
|
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
|
||||||
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
|
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
|
||||||
@ -51,7 +53,7 @@ You must set the following variables at the top of the script before execution.
|
|||||||
1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
|
1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
|
||||||
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
|
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
|
||||||
|
|
||||||
```
|
```bash
|
||||||
cd <FOLDER_OF_THIS_SCRIPT>
|
cd <FOLDER_OF_THIS_SCRIPT>
|
||||||
bash auto_tune.sh
|
bash auto_tune.sh
|
||||||
```
|
```
|
||||||
@ -63,34 +65,40 @@ bash auto_tune.sh
|
|||||||
Here are a few examples of how to configure the script for different goals:
|
Here are a few examples of how to configure the script for different goals:
|
||||||
|
|
||||||
### 1. Maximize Throughput (No Latency Constraint)
|
### 1. Maximize Throughput (No Latency Constraint)
|
||||||
|
|
||||||
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
|
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
|
||||||
- **Configuration**:
|
- **Configuration**:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=20
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
MIN_CACHE_HIT_PCT=0
|
MIN_CACHE_HIT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 2. Maximize Throughput with a Latency Requirement
|
#### 2. Maximize Throughput with a Latency Requirement
|
||||||
|
|
||||||
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
|
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
|
||||||
- **Configuration**:
|
- **Configuration**:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=20
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
MIN_CACHE_HIT_PCT=0
|
MIN_CACHE_HIT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=500
|
MAX_LATENCY_ALLOWED_MS=500
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
|
#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
|
||||||
|
|
||||||
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
|
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
|
||||||
- **Configuration**:
|
- **Configuration**:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=20
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
MIN_CACHE_HIT_PCT=60
|
MIN_CACHE_HIT_PCT=60
|
||||||
MAX_LATENCY_ALLOWED_MS=500
|
MAX_LATENCY_ALLOWED_MS=500
|
||||||
```
|
```
|
||||||
@ -101,11 +109,11 @@ After the script finishes, you will find the results in a new, timestamped direc
|
|||||||
|
|
||||||
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
|
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
|
||||||
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
|
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
|
||||||
- `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
|
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
|
||||||
|
|
||||||
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
|
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
|
||||||
|
|
||||||
```
|
```text
|
||||||
# Example result.txt content
|
# Example result.txt content
|
||||||
hash:a1b2c3d4...
|
hash:a1b2c3d4...
|
||||||
max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
|
max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
|
||||||
|
|||||||
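Since `result.txt` ends with the overall best parameters, the summary line can be pulled straight out of a finished run. A small sketch (the timestamped directory name is hypothetical):

```bash
# The last line of result.txt holds the best configuration found by the sweep
RESULT_DIR="$BASE/auto-benchmark/2025_01_01_00_00"   # hypothetical run directory
tail -n 1 "$RESULT_DIR/result.txt"
# e.g. best_max_num_seqs: 128, best_num_batched_tokens: 2048, best_throughput: 9.8, ...
```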
@@ -1,16 +1,18 @@
 #!/bin/bash
 
 # This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
 # See details in README (benchmarks/auto_tune/README.md).
 
 TAG=$(date +"%Y_%m_%d_%H_%M")
-BASE=""
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+BASE="$SCRIPT_DIR/../../.."
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
 SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
+MAX_MODEL_LEN=4096
 MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
 NUM_SEQS_LIST="128 256"

@@ -36,10 +38,18 @@ current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
 echo "current_hash: $current_hash"
 
+TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
+RED='\033[0;31m'
+if (( TOTAL_LEN > MAX_MODEL_LEN )); then
+  echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
+  exit 1
+fi
+
 best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
 best_goodput=0
+best_request_rate=0
 
 start_server() {
   local gpu_memory_utilization=$1

@@ -47,26 +57,42 @@ start_server() {
   local max_num_batched_tokens=$3
   local vllm_log=$4
   local profile_dir=$5
 
-  pkill -f vllm
-
-  VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
-    --disable-log-requests \
-    --port 8004 \
-    --gpu-memory-utilization $gpu_memory_utilization \
-    --max-num-seqs $max_num_seqs \
-    --max-num-batched-tokens $max_num_batched_tokens \
-    --tensor-parallel-size $TP \
-    --enable-prefix-caching \
-    --load-format dummy \
-    --download-dir "$DOWNLOAD_DIR" \
-    --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
+  pkill -if vllm
+
+  # Define the common arguments as a bash array.
+  # Each argument and its value are separate elements.
+  local common_args_array=(
+    "$MODEL"
+    "--disable-log-requests"
+    "--port" "8004"
+    "--gpu-memory-utilization" "$gpu_memory_utilization"
+    "--max-num-seqs" "$max_num_seqs"
+    "--max-num-batched-tokens" "$max_num_batched_tokens"
+    "--tensor-parallel-size" "$TP"
+    "--enable-prefix-caching"
+    "--load-format" "dummy"
+    "--download-dir" "$DOWNLOAD_DIR"
+    "--max-model-len" "$MAX_MODEL_LEN"
+  )
+
+  # Use the array expansion "${common_args_array[@]}"
+  # This correctly passes each element as a separate argument.
+  if [[ -n "$profile_dir" ]]; then
+    # Start server with profiling enabled
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+      vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+  else
+    # Start server without profiling
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+      vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+  fi
 
   # wait for 10 minutes...
   server_started=0
   for i in {1..60}; do
     RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
     STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
     if [[ "$STATUS_CODE" -eq 200 ]]; then
       server_started=1
       break

@@ -74,6 +100,7 @@ start_server() {
       sleep 10
     fi
   done
 
   if (( ! server_started )); then
     echo "server did not start within 10 minutes. Please check server log at $vllm_log".
     return 1

@@ -82,37 +109,20 @@ start_server() {
   fi
 }
 
-update_best_profile() {
-  local profile_dir=$1
-  local profile_index=$2
-  sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
-  selected_profile_file=
-  if [[ "$SYSTEM" == "TPU" ]]; then
-    selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
-  fi
-  if [[ "$SYSTEM" == "GPU" ]]; then
-    selected_profile_file="${sorted_paths[$profile_index]}"
-  fi
-  rm -f $PROFILE_PATH/*
-  cp $selected_profile_file $PROFILE_PATH
-}
-
 run_benchmark() {
   local max_num_seqs=$1
   local max_num_batched_tokens=$2
   local gpu_memory_utilization=$3
   echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
   local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
-  local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
   echo "vllm_log: $vllm_log"
   echo
   rm -f $vllm_log
-  mkdir -p $profile_dir
-  pkill -f vllm
-  local profile_index=0
+  pkill -if vllm
 
   echo "starting server..."
-  start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
+  # Call start_server without a profile_dir to avoid profiling overhead
+  start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
   result=$?
   if [[ "$result" -eq 1 ]]; then
     echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"

@@ -120,14 +130,15 @@ run_benchmark() {
     echo "server started."
   fi
   echo
 
   echo "run benchmark test..."
   meet_latency_requirement=0
   # get a basic qps by using request-rate inf
   bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
   prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
   adjusted_input_len=$(( INPUT_LEN - prefix_len ))
-  python3 benchmarks/benchmark_serving.py \
+  # --profile flag is removed from this call
+  vllm bench serve \
     --backend vllm \
     --model $MODEL \
     --dataset-name random \

@@ -140,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
     --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
     --num-prompts 1000 \
     --random-prefix-len $prefix_len \
-    --port 8004 \
-    --profile &> "$bm_log"
+    --port 8004 &> "$bm_log"
   throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
   e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
   goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

@@ -155,12 +165,11 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
   # start from request-rate as int(throughput) + 1
   request_rate=$((${throughput%.*} + 1))
   while ((request_rate > 0)); do
-    profile_index=$((profile_index+1))
     # clear prefix cache
     curl -X POST http://0.0.0.0:8004/reset_prefix_cache
     sleep 5
     bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
-    python3 benchmarks/benchmark_serving.py \
+    vllm bench serve \
       --backend vllm \
       --model $MODEL \
       --dataset-name random \

@@ -193,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
       best_max_num_seqs=$max_num_seqs
       best_num_batched_tokens=$max_num_batched_tokens
       best_goodput=$goodput
-      if [[ "$SYSTEM" == "TPU" ]]; then
-        update_best_profile "$profile_dir/plugins/profile" $profile_index
-      fi
-      if [[ "$SYSTEM" == "GPU" ]]; then
-        update_best_profile "$profile_dir" $profile_index
-      fi
+      best_request_rate=$request_rate
     fi
   else
     echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"

@@ -207,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
 
   echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 
-  pkill vllm
+  pkill -if vllm
   sleep 10
   printf '=%.0s' $(seq 1 20)
   return 0

@@ -220,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
 gpu_memory_utilization=0.98
 find_gpu_memory_utilization=0
 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
-  start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
+  # Pass empty string for profile_dir argument
+  start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
   result=$?
   if [[ "$result" -eq 0 ]]; then
     find_gpu_memory_utilization=1

@@ -243,6 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do
   done
 done
 echo "finish permutations"
 
+# =================================================================================
+# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
+# =================================================================================
+if (( $(echo "$best_throughput > 0" | bc -l) )); then
+  echo
+  echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
+  echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
+  echo
+
+  vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
+  bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
+
+  # Start server with the best params and profiling ENABLED
+  echo "Starting server for profiling..."
+  start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
+
+  # Run benchmark with the best params and the --profile flag
+  echo "Running benchmark with profiling..."
+  prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
+  adjusted_input_len=$(( INPUT_LEN - prefix_len ))
+  vllm bench serve \
+    --backend vllm \
+    --model $MODEL \
+    --dataset-name random \
+    --random-input-len $adjusted_input_len \
+    --random-output-len $OUTPUT_LEN \
+    --ignore-eos \
+    --disable-tqdm \
+    --request-rate $best_request_rate \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
+    --num-prompts 100 \
+    --random-prefix-len $prefix_len \
+    --port 8004 \
+    --profile &> "$bm_log"
+else
+  echo "No configuration met the latency requirements. Skipping final profiling run."
+fi
+pkill -if vllm
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from typing import Any, Optional
import numpy as np
from tqdm import tqdm
+ from typing_extensions import deprecated

import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)


+ @deprecated(
+ "benchmark_latency.py is deprecated and will be removed in a "
+ "future version. Please use 'vllm bench latency' instead.",
+ )
def main(args: argparse.Namespace):
print(args)

@@ -5,8 +5,7 @@ r"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
- --swap-space 16 \
- --disable-log-requests
+ --swap-space 16

On the client side, run:
python benchmarks/benchmark_serving.py \
@@ -38,6 +37,7 @@ from typing import Any, Literal, Optional
import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
+ from typing_extensions import deprecated

from backend_request_func import (
ASYNC_REQUEST_FUNCS,
@@ -395,20 +395,6 @@ async def benchmark(
tasks.append(asyncio.create_task(task))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

- if profile:
- print("Stopping profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- prompt=test_prompt,
- api_url=base_url + "/stop_profile",
- prompt_len=test_prompt_len,
- output_len=test_output_len,
- logprobs=logprobs,
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler stopped")

if pbar is not None:
pbar.close()

@@ -426,6 +412,10 @@ async def benchmark(

print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+ if max_concurrency is not None:
+ print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
+ if request_rate != float("inf"):
+ print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
@@ -517,6 +507,20 @@ async def benchmark(

print("=" * 50)

+ if profile:
+ print("Stopping profiler...")
+ profile_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_prompt,
+ api_url=base_url + "/stop_profile",
+ prompt_len=test_prompt_len,
+ output_len=test_output_len,
+ logprobs=logprobs,
+ )
+ profile_output = await request_func(request_func_input=profile_input)
+ if profile_output.success:
+ print("Profiler stopped")

return result


@@ -593,6 +597,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records)


+ @deprecated(
+ "benchmark_serving.py is deprecated and will be removed in a future "
+ "version. Please use 'vllm bench serve' instead.",
+ )
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)

@@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs.

On the server side, run one of the following commands:
(vLLM OpenAI API server)
- vllm serve <your_model> --disable-log-requests
+ vllm serve <your_model>

On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \
@@ -538,20 +538,6 @@ async def benchmark(
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

- if profile:
- print("Stopping profiler...")
- profile_input = RequestFuncInput(
- model=model_id,
- prompt=test_request.prompt,
- api_url=base_url + "/stop_profile",
- prompt_len=test_request.prompt_len,
- output_len=test_request.expected_output_len,
- extra_body={test_request.structure_type: test_request.schema},
- )
- profile_output = await request_func(request_func_input=profile_input)
- if profile_output.success:
- print("Profiler stopped")

if pbar is not None:
pbar.close()

@@ -569,6 +555,10 @@ async def benchmark(

print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+ if max_concurrency is not None:
+ print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
+ if request_rate != float("inf"):
+ print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
@@ -666,6 +656,20 @@ async def benchmark(

print("=" * 50)

+ if profile:
+ print("Stopping profiler...")
+ profile_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_request.prompt,
+ api_url=base_url + "/stop_profile",
+ prompt_len=test_request.prompt_len,
+ output_len=test_request.expected_output_len,
+ extra_body={test_request.structure_type: test_request.schema},
+ )
+ profile_output = await request_func(request_func_input=profile_input)
+ if profile_output.success:
+ print("Profiler stopped")

return result, ret


@@ -15,6 +15,7 @@ import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+ from typing_extensions import deprecated

from benchmark_dataset import (
AIMODataset,
@@ -167,7 +168,8 @@ async def run_vllm_async(
from vllm import SamplingParams

async with build_async_engine_client_from_engine_args(
- engine_args, disable_frontend_multiprocessing
+ engine_args,
+ disable_frontend_multiprocessing=disable_frontend_multiprocessing,
) as llm:
model_config = await llm.get_model_config()
assert all(
@@ -381,6 +383,10 @@ def get_requests(args, tokenizer):
return dataset_cls(**common_kwargs).sample(**sample_kwargs)


+ @deprecated(
+ "benchmark_throughput.py is deprecated and will be removed in a "
+ "future version. Please use 'vllm bench throughput' instead.",
+ )
def main(args: argparse.Namespace):
if args.seed is None:
args.seed = 0

@@ -3,7 +3,7 @@
# benchmark the overhead of disaggregated prefill.
# methodology:
# - send all request to prefill vLLM instance. It will buffer KV cache.
# - then send all request to decode instance.
# - The TTFT of decode instance is the overhead.

set -ex
@@ -12,6 +12,8 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pgrep VLLM | xargs -r kill -9
sleep 10

# remove vllm config file
@@ -61,7 +63,7 @@ benchmark() {
--gpu-memory-utilization 0.6 \
--kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &


CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -76,38 +78,38 @@ benchmark() {
wait_for_server 8200

# let the prefill instance finish prefill
- python3 ../benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"


# send the request to decode.
# The TTFT of this command will be the overhead of disagg prefill impl.
- python3 ../benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
kill_gpu_processes

}

@@ -18,6 +18,8 @@ kill_gpu_processes() {
# kill all processes on GPU.
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pgrep VLLM | xargs -r kill -9
for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
sleep 1
}
@@ -58,7 +60,7 @@ launch_chunked_prefill() {


launch_disagg_prefill() {
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
# disagg prefill
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
@@ -97,20 +99,20 @@ benchmark() {
output_len=$2
tag=$3

- python3 ../benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--sonnet-input-len $input_len \
--sonnet-output-len "$output_len" \
--sonnet-prefix-len $prefix_len \
--num-prompts $num_prompts \
--port 8000 \
--save-result \
--result-dir $results_folder \
--result-filename "$tag"-qps-"$qps".json \
--request-rate "$qps"

sleep 2
}

@@ -22,6 +22,13 @@ from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()


+ def ensure_divisibility(numerator, denominator):
+ """Ensure that numerator is divisible by the denominator."""
+ assert numerator % denominator == 0, (
+ "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator)
+ )
+
+
class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
@@ -603,7 +610,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
+ ensure_divisibility(intermediate_size, args.tp_size)
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"

@@ -5,9 +5,8 @@ import itertools

import torch

- from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
- moe_align_block_size_triton,
+ moe_align_block_size,
)
from vllm.triton_utils import triton

@@ -21,60 +20,6 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
)


- def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
- """
- Verifies vllm vs. Triton
- """
- topk_ids = get_topk_ids(num_tokens, num_experts, topk)
-
- # 1. malloc space for triton and vllm
- # malloc enough space (max_num_tokens_padded) for the sorted ids
- max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
- sorted_ids_triton = torch.empty(
- (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
- )
- expert_ids_triton = torch.empty(
- (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
- )
- num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
-
- sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
- expert_ids_vllm = torch.empty_like(expert_ids_triton)
- num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
-
- # 2. run implementations
- moe_align_block_size_triton(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids_triton,
- expert_ids_triton,
- num_tokens_post_pad_triton,
- )
-
- ops.moe_align_block_size(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids_vllm,
- expert_ids_vllm,
- num_tokens_post_pad_vllm,
- )
- print(f"✅ VLLM implementation works with {num_experts} experts!")
-
- # 3. compare results
- if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
- num_tokens_post_pad_triton, num_tokens_post_pad_vllm
- ):
- print("✅ Triton and VLLM implementations match.")
- else:
- print("❌ Triton and VLLM implementations DO NOT match.")
- print("Triton expert_ids:", expert_ids_triton)
- print("VLLM expert_ids:", expert_ids_vllm)
- print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
- print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)


# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
@@ -87,8 +32,8 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
- line_vals=["vllm", "triton"], # "triton"
+ line_vals=["vllm"],
- line_names=["VLLM", "Triton"], # "Triton"
+ line_names=["vLLM"],
plot_name="moe-align-block-size-performance",
args={},
)
@@ -98,36 +43,11 @@ def benchmark(num_tokens, num_experts, topk, provider):
block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk)

- max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
- sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
- max_num_m_blocks = max_num_tokens_padded // block_size
- expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
- num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")

quantiles = [0.5, 0.2, 0.8]

if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: ops.moe_align_block_size(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids.clone(),
- expert_ids.clone(),
- num_tokens_post_pad.clone(),
- ),
- quantiles=quantiles,
- )
- elif provider == "triton":
- ms, min_ms, max_ms = triton.testing.do_bench(
- lambda: moe_align_block_size_triton(
- topk_ids,
- num_experts,
- block_size,
- sorted_ids.clone(),
- expert_ids.clone(),
- num_tokens_post_pad.clone(),
- ),
+ lambda: moe_align_block_size(topk_ids, block_size, num_experts),
quantiles=quantiles,
)

@@ -151,6 +71,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()

- print("Running correctness check...")
- check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True)

@@ -8,12 +8,13 @@ import ray
import torch
from transformers import AutoConfig

- from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+ from vllm.model_executor.layers.fused_moe.fused_moe import *
+ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute,
_moe_unpermute_and_reduce,
+ moe_permute,
+ moe_unpermute,
)
- from vllm.model_executor.layers.fused_moe.fused_moe import *
- from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
@@ -63,18 +64,19 @@ def benchmark_permute(

def run():
if use_customized_permute:
- (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
- moe_permute(
- qhidden_states,
- topk_weights=topk_weights,
- topk_ids=topk_ids,
- token_expert_indices=token_expert_indices,
- topk=topk,
- n_expert=num_experts,
- n_local_expert=num_experts,
- expert_map=None,
- align_block_size=align_block_size,
- )
+ (
+ permuted_hidden_states,
+ a1q_scale,
+ first_token_off,
+ inv_perm_idx,
+ m_indices,
+ ) = moe_permute(
+ qhidden_states,
+ a1q_scale=None,
+ topk_ids=topk_ids,
+ n_expert=num_experts,
+ expert_map=None,
+ align_block_size=align_block_size,
)
else:
(
@@ -150,18 +152,19 @@ def benchmark_unpermute(

def prepare():
if use_customized_permute:
- (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
- moe_permute(
- qhidden_states,
- topk_weights=topk_weights,
- topk_ids=topk_ids,
- token_expert_indices=token_expert_indices,
- topk=topk,
- n_expert=num_experts,
- n_local_expert=num_experts,
- expert_map=None,
- align_block_size=align_block_size,
- )
+ (
+ permuted_hidden_states,
+ a1q_scale,
+ first_token_off,
+ inv_perm_idx,
+ m_indices,
+ ) = moe_permute(
+ qhidden_states,
+ a1q_scale=None,
+ topk_ids=topk_ids,
+ n_expert=num_experts,
+ expert_map=None,
+ align_block_size=align_block_size,
)
# convert to fp16/bf16 as gemm output
return (
@@ -191,16 +194,19 @@ def benchmark_unpermute(

def run(input: tuple):
if use_customized_permute:
- (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
+ (
+ permuted_hidden_states,
+ first_token_off,
+ inv_perm_idx,
+ m_indices,
+ ) = input
+ output = torch.empty_like(hidden_states)
moe_unpermute(
+ output,
permuted_hidden_states,
topk_weights,
- topk_ids,
inv_perm_idx,
first_token_off,
- topk,
- num_experts,
- num_experts,
)
else:
(
@@ -211,7 +217,11 @@ def benchmark_unpermute(
inv_perm,
) = input
_moe_unpermute_and_reduce(
- output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
+ output_hidden_states,
+ permuted_hidden_states,
+ inv_perm,
+ topk_weights,
+ True,
)

# JIT compilation & warmup

benchmarks/kernels/benchmark_per_token_group_quant.py (new file, 159 lines)
@@ -0,0 +1,159 @@
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from typing import Callable
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _triton_mode():
|
||||||
|
"""Temporarily force the Triton fallback path"""
|
||||||
|
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
|
||||||
|
yield
|
||||||
|
|
||||||
|
|
||||||
|
def _time_cuda(
|
||||||
|
fn: Callable[[], tuple[torch.Tensor, torch.Tensor]],
|
||||||
|
warmup_iters: int,
|
||||||
|
bench_iters: int,
|
||||||
|
) -> float:
|
||||||
|
# warmup
|
||||||
|
for _ in range(warmup_iters):
|
||||||
|
fn()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start = torch.cuda.Event(enable_timing=True)
|
||||||
|
end = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
|
start.record()
|
||||||
|
for _ in range(bench_iters):
|
||||||
|
fn()
|
||||||
|
end.record()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
return start.elapsed_time(end) / bench_iters # ms/iter
|
||||||
|
|
||||||
|
|
||||||
|
def _run_single(
|
||||||
|
shape: tuple[int, int],
|
||||||
|
group_size: int,
|
||||||
|
dtype: str,
|
||||||
|
*,
|
||||||
|
column_major: bool = False,
|
||||||
|
scale_ue8m0: bool = False,
|
||||||
|
warmup_iters: int,
|
||||||
|
bench_iters: int,
|
||||||
|
) -> None:
|
||||||
|
num_tokens, hidden_dim = shape
|
||||||
|
|
||||||
|
device = torch.device("cuda")
|
||||||
|
torch.manual_seed(42)
|
||||||
|
x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8
|
||||||
|
|
||||||
|
if dtype == "fp8":
|
||||||
|
|
||||||
|
def cuda_impl():
|
||||||
|
return fp8_utils.per_token_group_quant_fp8(
|
||||||
|
x,
|
||||||
|
group_size,
|
||||||
|
column_major_scales=column_major,
|
||||||
|
use_ue8m0=scale_ue8m0,
|
||||||
|
)
|
||||||
|
|
||||||
|
def triton_impl():
|
||||||
|
with _triton_mode():
|
||||||
|
return fp8_utils.per_token_group_quant_fp8(
|
||||||
|
x,
|
||||||
|
group_size,
|
||||||
|
column_major_scales=column_major,
|
||||||
|
use_ue8m0=scale_ue8m0,
|
||||||
|
)
|
||||||
|
elif dtype == "int8":
|
||||||
|
|
||||||
|
def cuda_impl():
|
||||||
|
return int8_utils.per_token_group_quant_int8(x, group_size)
|
||||||
|
|
||||||
|
def triton_impl():
|
||||||
|
with _triton_mode():
|
||||||
|
return int8_utils.per_token_group_quant_int8(x, group_size)
|
||||||
|
else:
|
||||||
|
raise ValueError("dtype must be 'fp8' or 'int8'")
|
||||||
|
|
||||||
|
cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters)
|
||||||
|
triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters)
|
||||||
|
|
||||||
|
speedup = triton_ms / cuda_ms if cuda_ms else math.inf
|
||||||
|
|
||||||
|
cfg_desc = (
|
||||||
|
f"shape={shape} gs={group_size:<3} col_major={column_major:<5} "
|
||||||
|
f"ue8m0={scale_ue8m0:<5} dtype={dtype}"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | "
|
||||||
|
f"speed-up ×{speedup:5.2f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--warmup-iters", type=int, default=10)
|
||||||
|
parser.add_argument("--bench-iters", type=int, default=100)
|
||||||
|
parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both")
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
if not current_platform.is_cuda():
|
||||||
|
raise RuntimeError("CUDA device is required to run this benchmark.")
|
||||||
|
|
||||||
|
args = parse_args()
|
||||||
|
warmup_iters, bench_iters = args.warmup_iters, args.bench_iters
|
||||||
|
|
||||||
|
shapes = [(32, 128), (64, 256), (16, 512)]
|
||||||
|
group_sizes = [64, 128]
|
||||||
|
|
||||||
|
dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype]
|
||||||
|
|
||||||
|
header = (
|
||||||
|
"Configuration".ljust(55)
|
||||||
|
+ " | "
|
||||||
|
+ "CUDA (ms)".center(12)
|
||||||
|
+ " | "
|
||||||
|
+ "Triton (ms)".center(13)
|
||||||
|
+ " | "
|
||||||
|
+ "Speed-up"
|
||||||
|
)
|
||||||
|
print(header)
|
||||||
|
print("-" * len(header))
|
||||||
|
|
||||||
|
for dtype in dtypes:
|
||||||
|
for shape in shapes:
|
||||||
|
for gs in group_sizes:
|
||||||
|
if dtype == "fp8":
|
||||||
|
for col_major in (False, True):
|
||||||
|
for ue8m0 in (False, True):
|
||||||
|
_run_single(
|
||||||
|
shape,
|
||||||
|
gs,
|
||||||
|
dtype,
|
||||||
|
column_major=col_major,
|
||||||
|
scale_ue8m0=ue8m0,
|
||||||
|
warmup_iters=warmup_iters,
|
||||||
|
bench_iters=bench_iters,
|
||||||
|
)
|
||||||
|
else: # INT8 has no col-major / ue8m0 switches
|
||||||
|
_run_single(
|
||||||
|
shape,
|
||||||
|
gs,
|
||||||
|
dtype,
|
||||||
|
warmup_iters=warmup_iters,
|
||||||
|
bench_iters=bench_iters,
|
||||||
|
)
|
||||||
benchmarks/kernels/benchmark_reshape_and_cache_flash.py (new file, 156 lines)
@@ -0,0 +1,156 @@
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import (
|
||||||
|
STR_DTYPE_TO_TORCH_DTYPE,
|
||||||
|
FlexibleArgumentParser,
|
||||||
|
create_kv_caches_with_random_flash,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@torch.inference_mode()
|
||||||
|
def run_benchmark(
|
||||||
|
num_tokens: int,
|
||||||
|
num_heads: int,
|
||||||
|
head_size: int,
|
||||||
|
block_size: int,
|
||||||
|
num_blocks: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
kv_cache_dtype: str,
|
||||||
|
kv_cache_layout: str,
|
||||||
|
num_iters: int,
|
||||||
|
device: str = "cuda",
|
||||||
|
) -> float:
|
||||||
|
"""Return latency (seconds) for given num_tokens."""
|
||||||
|
|
||||||
|
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||||
|
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
|
||||||
|
|
||||||
|
current_platform.seed_everything(42)
|
||||||
|
torch.set_default_device(device)
|
||||||
|
|
||||||
|
# create random key / value tensors [T, H, D].
|
||||||
|
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
|
||||||
|
value = torch.randn_like(key)
|
||||||
|
|
||||||
|
# prepare the slot mapping.
|
||||||
|
# each token is assigned a unique slot in the KV-cache.
|
||||||
|
num_slots = block_size * num_blocks
|
||||||
|
if num_tokens > num_slots:
|
||||||
|
raise ValueError("num_tokens cannot exceed the total number of cache slots")
|
||||||
|
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
|
||||||
|
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
|
||||||
|
|
||||||
|
key_caches, value_caches = create_kv_caches_with_random_flash(
|
||||||
|
num_blocks,
|
||||||
|
block_size,
|
||||||
|
1, # num_layers
|
||||||
|
num_heads,
|
||||||
|
head_size,
|
||||||
|
kv_cache_dtype,
|
||||||
|
dtype,
|
||||||
|
device=device,
|
||||||
|
cache_layout=kv_cache_layout,
|
||||||
|
)
|
||||||
|
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||||
|
|
||||||
|
# compute per-kernel scaling factors for fp8 conversion (if used).
|
||||||
|
k_scale = (key.amax() / 64.0).to(torch.float32)
|
||||||
|
v_scale = (value.amax() / 64.0).to(torch.float32)
|
||||||
|
|
||||||
|
def run_cuda_benchmark(n_iters: int) -> float:
|
||||||
|
nonlocal key, value, key_cache, value_cache, slot_mapping
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
start = time.perf_counter()
|
||||||
|
for _ in range(n_iters):
|
||||||
|
ops.reshape_and_cache_flash(
|
||||||
|
key,
|
||||||
|
value,
|
||||||
|
key_cache,
|
||||||
|
value_cache,
|
||||||
|
slot_mapping,
|
||||||
|
kv_cache_dtype,
|
||||||
|
k_scale,
|
||||||
|
v_scale,
|
||||||
|
)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
end = time.perf_counter()
|
||||||
|
return (end - start) / n_iters
|
||||||
|
|
||||||
|
# warm-up
|
||||||
|
run_cuda_benchmark(3)
|
||||||
|
|
||||||
|
lat = run_cuda_benchmark(num_iters)
|
||||||
|
|
||||||
|
# free tensors to mitigate OOM when sweeping
|
||||||
|
del key, value, key_cache, value_cache, slot_mapping
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
return lat
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
rows = []
|
||||||
|
for layout in ["NHD", "HND"]:
|
||||||
|
for exp in range(1, 17):
|
||||||
|
n_tok = 2**exp
|
||||||
|
lat = run_benchmark(
|
||||||
|
num_tokens=n_tok,
|
||||||
|
num_heads=args.num_heads,
|
||||||
|
head_size=args.head_size,
|
||||||
|
block_size=args.block_size,
|
||||||
|
num_blocks=args.num_blocks,
|
||||||
|
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
|
||||||
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
|
kv_cache_layout=layout,
|
||||||
|
num_iters=args.iters,
|
||||||
|
device="cuda",
|
||||||
|
)
|
||||||
|
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
|
||||||
|
|
||||||
|
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = FlexibleArgumentParser()
|
||||||
|
|
||||||
|
parser.add_argument("--num-heads", type=int, default=128)
|
||||||
|
parser.add_argument(
|
||||||
|
"--head-size",
|
||||||
|
type=int,
|
||||||
|
choices=[64, 80, 96, 112, 120, 128, 192, 256],
|
||||||
|
default=128,
|
||||||
|
)
|
||||||
|
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
|
||||||
|
parser.add_argument("--num-blocks", type=int, default=128 * 512)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--dtype",
|
||||||
|
type=str,
|
||||||
|
choices=["half", "bfloat16", "float"],
|
||||||
|
default="bfloat16",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--kv-cache-dtype",
|
||||||
|
type=str,
|
||||||
|
choices=["auto", "fp8"],
|
||||||
|
default="auto",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument("--iters", type=int, default=100)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
main(args)
|
||||||
@@ -41,7 +41,6 @@ def benchmark_decode(
device = "cuda"
torch.manual_seed(0)

- # Currently only HEAD_GRP_SIZE == 8 is supported
HEAD_GRP_SIZE = 8
MAX_SEQ_LEN = max_seq_len

@@ -71,22 +70,20 @@ def benchmark_decode(
if kv_cache_dtype.startswith("fp8"):
kv_cache, _ = to_float8(kv_cache)

+ output_trtllm = torch.empty(q.shape, dtype=dtype)
+
# Benchmark TRT decode
def trt_decode():
return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
q,
kv_cache,
workspace_buffer,
- num_qo_heads,
- num_kv_heads,
- sm_scale,
block_tables,
kv_lens_tensor,
- page_size,
max_kv_len,
- kv_cache_dtype,
- k_scale,
- v_scale,
+ bmm1_scale=k_scale * sm_scale,
+ bmm2_scale=v_scale,
+ out=output_trtllm,
)

def time_fn(fn, warmup=10, trials=20):
@@ -125,6 +122,8 @@ def benchmark_decode(
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)

+ output_baseline = torch.empty(q.shape, dtype=dtype)
+
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
workspace_buffer,
kv_layout,
@@ -145,7 +144,7 @@ def benchmark_decode(
)

def baseline_decode():
- return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale)
+ return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline)

baseline_mean, baseline_std = time_fn(baseline_decode)

@@ -214,25 +213,39 @@ if __name__ == "__main__":
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
all_results = []

- print("Running benchmark for kv_cache_dtype: bfloat16")
print(
- "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
+ "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
+ "output_dtype: bfloat16"
+ )
+ print(
+ "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
+ "baseline_std\tspeedup_percent"
)
for max_seq_len in max_seq_lens:
for bs in num_seqs:
result = benchmark_decode(
- bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto"
+ bs,
+ max_seq_len,
+ dtype=torch.bfloat16,
+ kv_cache_dtype="auto",
)
all_results.append(result)

- print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8")
print(
- "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
+ "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, "
+ "output_dtype: bfloat16"
+ )
+ print(
+ "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
+ "baseline_std\tspeedup_percent"
)
for max_seq_len in max_seq_lens:
for bs in num_seqs:
result = benchmark_decode(
- bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8"
+ bs,
+ max_seq_len,
+ dtype=torch.bfloat16,
+ kv_cache_dtype="fp8",
)
all_results.append(result)

benchmarks/kernels/benchmark_trtllm_prefill_attention.py (new file, 250 lines)
@@ -0,0 +1,250 @@
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import flashinfer
|
||||||
|
import torch
|
||||||
|
|
||||||
|
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||||
|
|
||||||
|
# KV Cache Layout for TRT-LLM
|
||||||
|
# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)
|
||||||
|
|
||||||
|
|
||||||
|
def to_float8(x, dtype=torch.float8_e4m3fn):
|
||||||
|
finfo = torch.finfo(dtype)
|
||||||
|
min_val, max_val = x.aminmax()
|
||||||
|
amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
|
||||||
|
scale = finfo.max / amax * 0.1
|
||||||
|
x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
|
||||||
|
return x_scl_sat.to(dtype), scale.float().reciprocal()
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def benchmark_prefill(
|
||||||
|
num_seqs,
|
||||||
|
max_seq_len,
|
||||||
|
page_size=16,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
kv_layout="HND",
|
||||||
|
num_kv_heads=8,
|
||||||
|
kv_cache_dtype="auto",
|
||||||
|
head_dim=128,
|
||||||
|
warmup=10,
|
||||||
|
trials=20,
|
||||||
|
):
|
||||||
|
torch.set_default_device("cuda")
|
||||||
|
torch.manual_seed(0)
|
||||||
|
|
||||||
|
HEAD_GRP_SIZE = 8
|
||||||
|
MAX_SEQ_LEN = max_seq_len
|
||||||
|
|
||||||
|
# large number to reduce kv_cache reuse
|
||||||
|
NUM_BLOCKS = int(256000 / page_size)
|
||||||
|
|
||||||
|
workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8)
|
||||||
|
|
||||||
|
num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
|
||||||
|
sm_scale = float(1.0 / (head_dim**0.5))
|
||||||
|
|
||||||
|
q_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
|
||||||
|
q_lens[-1] = MAX_SEQ_LEN
|
||||||
|
max_q_len = max(q_lens)
|
||||||
|
q_indptr = torch.cat(
|
||||||
|
[
|
||||||
|
torch.tensor([0], dtype=torch.int32),
|
||||||
|
torch.cumsum(
|
||||||
|
torch.tensor(q_lens, dtype=torch.int32), dim=0, dtype=torch.int32
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
q = torch.randn(sum(q_lens), num_qo_heads, head_dim, dtype=dtype)
|
||||||
|
|
||||||
|
kv_lens = [random.randint(0, MAX_SEQ_LEN) for _ in range(num_seqs)]
|
||||||
|
kv_lens[-1] = MAX_SEQ_LEN
|
||||||
|
|
||||||
|
seq_lens = [q_len + kv_len for q_len, kv_len in zip(q_lens, kv_lens)]
|
||||||
|
max_seq_len = max(seq_lens)
|
||||||
|
seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32)
|
||||||
|
|
||||||
|
max_num_blocks_per_seq = (max_seq_len + page_size - 1) // page_size
|
||||||
|
block_tables = torch.randint(
|
||||||
|
0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
|
||||||
|
)
|
||||||
|
|
||||||
|
kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
|
||||||
|
kv_cache = torch.randn(size=kv_cache_shape, dtype=dtype)
|
||||||
|
k_scale = v_scale = 1.0
|
||||||
|
|
||||||
|
if kv_cache_dtype.startswith("fp8"):
|
||||||
|
kv_cache, _ = to_float8(kv_cache)
|
||||||
|
|
||||||
|
output_trtllm = torch.empty(q.shape, dtype=dtype)
|
||||||
|
|
||||||
|
kv_indptr = [0]
|
||||||
|
kv_indices = []
|
||||||
|
kv_last_page_lens = []
|
||||||
|
for i in range(num_seqs):
|
||||||
|
seq_len = seq_lens[i]
|
||||||
|
assert seq_len > 0
|
||||||
|
num_blocks = (seq_len + page_size - 1) // page_size
|
||||||
|
kv_indices.extend(block_tables[i, :num_blocks])
|
||||||
|
kv_indptr.append(kv_indptr[-1] + num_blocks)
|
||||||
|
kv_last_page_len = seq_len % page_size
|
||||||
|
if kv_last_page_len == 0:
|
||||||
|
kv_last_page_len = page_size
|
||||||
|
kv_last_page_lens.append(kv_last_page_len)
|
||||||
|
|
||||||
|
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
|
||||||
|
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
|
||||||
|
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
|
||||||
|
|
||||||
|
output_baseline = torch.empty(q.shape, dtype=dtype)
|
||||||
|
|
||||||
|
wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
|
||||||
|
workspace_buffer, kv_layout
|
||||||
|
)
|
||||||
|
wrapper.plan(
|
||||||
|
q_indptr,
|
||||||
|
kv_indptr,
|
||||||
|
kv_indices,
|
||||||
|
kv_last_page_lens,
|
||||||
|
num_qo_heads,
|
||||||
|
num_kv_heads,
|
||||||
|
head_dim,
|
||||||
|
page_size,
|
||||||
|
causal=True,
|
||||||
|
sm_scale=sm_scale,
|
||||||
|
q_data_type=dtype,
|
||||||
|
kv_data_type=kv_cache.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
def time_fn(fn, warmup=10, trials=20):
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
start = torch.cuda.Event(enable_timing=True)
|
||||||
|
end = torch.cuda.Event(enable_timing=True)
|
||||||
|
times = []
|
||||||
|
for i in range(warmup):
|
||||||
|
fn()
|
||||||
|
for i in range(trials):
|
||||||
|
start.record()
|
||||||
|
fn()
|
||||||
|
end.record()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
times.append(start.elapsed_time(end)) # ms
|
||||||
|
return sum(times) / len(times), torch.std(torch.tensor(times))
|
||||||
|
|
||||||
|
def baseline_prefill():
|
||||||
|
return wrapper.run(
|
||||||
|
q, kv_cache, k_scale=k_scale, v_scale=v_scale, out=output_baseline
|
||||||
|
)
|
||||||
|
|
||||||
|
def trt_prefill():
|
||||||
|
return flashinfer.prefill.trtllm_batch_context_with_kv_cache(
|
||||||
|
query=q,
|
||||||
|
kv_cache=kv_cache,
|
||||||
|
workspace_buffer=workspace_buffer,
|
||||||
|
block_tables=block_tables,
|
||||||
|
seq_lens=seq_lens_tensor,
|
||||||
|
max_q_len=max_q_len,
|
||||||
|
max_kv_len=max_seq_len,
|
||||||
|
bmm1_scale=k_scale * sm_scale,
|
||||||
|
bmm2_scale=v_scale,
|
||||||
|
batch_size=num_seqs,
|
||||||
|
cum_seq_lens_q=q_indptr,
|
||||||
|
cum_seq_lens_kv=kv_indptr,
|
||||||
|
out=output_trtllm,
|
||||||
|
)
|
||||||
|
|
||||||
|
trt_mean, trt_std = time_fn(trt_prefill)
|
||||||
|
baseline_mean, baseline_std = time_fn(baseline_prefill)
|
||||||
|
|
||||||
|
# Calculate percentage speedup (positive means TRT is faster)
|
||||||
|
speedup_percent = (baseline_mean - trt_mean) / baseline_mean
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.5f}\t{trt_std.item():.5f}"
|
||||||
|
f"\t{baseline_mean:.5f}\t{baseline_std.item():.5f}\t{speedup_percent:.5f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Return results for CSV writing
|
||||||
|
return {
|
||||||
|
"num_seqs": num_seqs,
|
||||||
|
"trt_mean": trt_mean,
|
||||||
|
"trt_std": trt_std.item(),
|
||||||
|
"baseline_mean": baseline_mean,
|
||||||
|
"baseline_std": baseline_std.item(),
|
||||||
|
"speedup_percent": speedup_percent,
|
||||||
|
"q_dtype": str(dtype),
|
||||||
|
"kv_cache_dtype": kv_cache_dtype,
|
||||||
|
"page_size": page_size,
|
||||||
|
"num_kv_heads": num_kv_heads,
|
||||||
|
"head_dim": head_dim,
|
||||||
|
"max_seq_len": max_seq_len,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def write_results_to_csv(results, filename=None):
|
||||||
|
"""Write benchmark results to CSV file."""
|
||||||
|
if filename is None:
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"
|
||||||
|
|
||||||
|
fieldnames = [
|
||||||
|
"num_seqs",
|
||||||
|
"trt_mean",
|
||||||
|
"trt_std",
|
||||||
|
"baseline_mean",
|
||||||
|
"baseline_std",
|
||||||
|
"speedup_percent",
|
||||||
|
"q_dtype",
|
||||||
|
"kv_cache_dtype",
|
||||||
|
"page_size",
|
||||||
|
"num_kv_heads",
|
||||||
|
"head_dim",
|
||||||
|
"max_seq_len",
|
||||||
|
]
|
||||||
|
|
||||||
|
file_exists = os.path.exists(filename)
|
||||||
|
|
||||||
|
with open(filename, "a", newline="") as csvfile:
|
||||||
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||||
|
|
||||||
|
if not file_exists:
|
||||||
|
writer.writeheader()
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
writer.writerow(result)
|
||||||
|
|
||||||
|
print(f"Results written to {filename}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
|
||||||
|
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
|
||||||
|
all_results = []
|
||||||
|
|
||||||
|
print(
|
||||||
|
"Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
|
||||||
|
"output_dtype: bfloat16"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
|
||||||
|
"baseline_std\tspeedup_percent"
|
||||||
|
)
|
||||||
|
for max_seq_len in max_seq_lens:
|
||||||
|
for bs in num_seqs:
|
||||||
|
result = benchmark_prefill(
|
||||||
|
bs,
|
||||||
|
max_seq_len,
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
kv_cache_dtype="auto",
|
||||||
|
)
|
||||||
|
all_results.append(result)
|
||||||
|
|
||||||
|
# Write all results to CSV
|
||||||
|
write_results_to_csv(all_results)
|
||||||
@@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs.

You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory:

- ```
+ ```bash
git clone --recursive https://github.com/deepseek-ai/DeepGEMM
cd DeepGEMM
python setup.py install
@@ -17,7 +17,7 @@ uv pip install -e .

## Usage

- ```
+ ```console
python benchmark_fp8_block_dense_gemm.py
INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
===== STARTING FP8 GEMM BENCHMARK =====

@@ -4,49 +4,16 @@

 # ruff: noqa: E501
 import time
-
-# Import DeepGEMM functions
-import deep_gemm
 import torch
-from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
-
-# Import vLLM functions
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    get_col_major_tma_aligned_tensor,
     per_token_group_quant_fp8,
     w8a8_block_fp8_matmul,
 )
 from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
-
-
-# Copied from
-# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
-def per_token_cast_to_fp8(
-        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    """Convert tensor to FP8 format with per-token scaling."""
-    assert x.dim() == 2 and x.size(1) % 128 == 0
-    m, n = x.shape
-    x_view = x.view(m, -1, 128)
-    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
-    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
-        torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
-
-
-# Copied from
-# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
-def per_block_cast_to_fp8(
-        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    """Convert tensor to FP8 format with per-block scaling."""
-    assert x.dim() == 2
-    m, n = x.shape
-    x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
-                           dtype=x.dtype,
-                           device=x.device)
-    x_padded[:m, :n] = x
-    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
-    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
-    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
-        x_amax / 448.0).view(x_view.size(0), x_view.size(2))
-
-
 def benchmark_shape(m: int,

@@ -69,14 +36,14 @@ def benchmark_shape(m: int,

     # Pre-quantize B for all implementations
     # (weights can be pre-quantized offline)
-    B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
-    B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
+    B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
+    B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)

     # Block size configuration
     block_size = [128, 128]

     # Pre-quantize A for all implementations
-    A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
+    A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
     A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
     C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
     A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])

@@ -85,7 +52,7 @@ def benchmark_shape(m: int,

     # === DeepGEMM Implementation ===
     def deepgemm_gemm():
-        deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
-                                       (B_deepgemm, B_scale_deepgemm),
-                                       C_deepgemm)
+        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
+                    (B_deepgemm, B_scale_deepgemm),
+                    C_deepgemm)
         return C_deepgemm
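Not part of the diff: a minimal, self-contained sketch of the rewritten DeepGEMM path, assembled only from the helpers imported in the hunks above. The shapes are arbitrary illustrative values, and the assumption that `fp8_gemm_nt` writes a bf16 `D = A @ B^T` from the `(quantized, scale)` pairs is taken from how the benchmark calls it; running it requires DeepGEMM and a Hopper-class GPU.

```python
import torch

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    get_col_major_tma_aligned_tensor,
    per_token_group_quant_fp8,
)
from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8

m, k, n = 128, 4096, 4096  # illustrative shapes only
A = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
B = torch.randn(n, k, device="cuda", dtype=torch.bfloat16)  # row-major weight, used as B^T

# Activations: per-token-group (1x128) FP8 quantization, scales laid out for TMA.
A_q, A_s = per_token_group_quant_fp8(A, 128)
A_s = get_col_major_tma_aligned_tensor(A_s)

# Weights: per-block (128x128) FP8 quantization with UE8M0 scales, as in the diff above.
B_q, B_s = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)

C = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
fp8_gemm_nt((A_q, A_s), (B_q, B_s), C)
```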
@@ -58,6 +58,22 @@ function (find_isa CPUINFO TARGET OUT)

   endif()
 endfunction()

+function(check_sysctl TARGET OUT)
+  execute_process(COMMAND sysctl -n "${TARGET}"
+                  RESULT_VARIABLE SYSCTL_RET
+                  OUTPUT_VARIABLE SYSCTL_INFO
+                  ERROR_QUIET
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(SYSCTL_RET EQUAL 0 AND
+     (SYSCTL_INFO STREQUAL "1" OR SYSCTL_INFO GREATER 0))
+    set(${OUT} ON PARENT_SCOPE)
+  else()
+    set(${OUT} OFF PARENT_SCOPE)
+  endif()
+endfunction()
+
 function (is_avx512_disabled OUT)
   set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
   if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")

@@ -70,7 +86,10 @@ endfunction()

 is_avx512_disabled(AVX512_DISABLED)

 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
-  set(APPLE_SILICON_FOUND TRUE)
+  message(STATUS "Apple Silicon Detected")
+  set(ENABLE_NUMA OFF)
+  check_sysctl(hw.optional.neon ASIMD_FOUND)
+  check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
 else()
   find_isa(${CPUINFO} "avx2" AVX2_FOUND)
   find_isa(${CPUINFO} "avx512f" AVX512_FOUND)

@@ -82,7 +101,6 @@ else()

   find_isa(${CPUINFO} "S390" S390_FOUND)
 endif()

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
   list(APPEND CXX_COMPILE_FLAGS
     "-mavx512f"

@@ -149,9 +167,6 @@ elseif (ASIMD_FOUND)

     set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
   endif()
   list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
-elseif(APPLE_SILICON_FOUND)
-  message(STATUS "Apple Silicon Detected")
-  set(ENABLE_NUMA OFF)
 elseif (S390_FOUND)
   message(STATUS "S390 detected")
   # Check for S390 VXE support
@@ -38,7 +38,7 @@ else()

   FetchContent_Declare(
           vllm-flash-attn
           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 1c2624e53c078854e0637ee566c72fe2107e75f4
+          GIT_TAG 6dbc6e011a3ebe9349eeb74578940dd7095436ba
           GIT_PROGRESS TRUE
           # Don't share the vllm-flash-attn build between build types
           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -467,6 +467,12 @@ function (define_gpu_extension_target GPU_MOD_NAME)

   if (GPU_LANGUAGE STREQUAL "HIP")
     # Make this target dependent on the hipify preprocessor step.
     add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+    # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
+    target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
+      ${GPU_INCLUDE_DIRECTORIES})
+  else()
+    target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
+      ${GPU_INCLUDE_DIRECTORIES})
   endif()

   if (GPU_ARCHITECTURES)

@@ -482,8 +488,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)

   target_compile_definitions(${GPU_MOD_NAME} PRIVATE
                              "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")

-  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
-                             ${GPU_INCLUDE_DIRECTORIES})

   target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
@@ -24,7 +24,7 @@

 #include "attention_dtypes.h"
 #include "attention_utils.cuh"
-#include "cuda_compat.h"
+#include "../cuda_compat.h"

 #ifdef USE_ROCM
   #include <hip/hip_bf16.h>
@@ -16,9 +16,8 @@

  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 #include "attention_kernels.cuh"
-#include "cuda_compat.h"
+#include "../cuda_compat.h"

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))

@@ -75,7 +74,7 @@ void paged_attention_v1_launcher(

   const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
   const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());

-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   int padded_max_seq_len =
       DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
   int logits_size = padded_max_seq_len * sizeof(float);
@@ -16,9 +16,8 @@

  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 #include "attention_kernels.cuh"
-#include "cuda_compat.h"
+#include "../cuda_compat.h"

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))

@@ -79,7 +78,7 @@ void paged_attention_v2_launcher(

   const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
   const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());

-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
   int logits_size = PARTITION_SIZE * sizeof(float);
   int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
@@ -5,6 +5,7 @@

 #include "cuda_utils.h"
 #include "cuda_compat.h"
 #include "dispatch_utils.h"
+#include "quantization/vectorization_utils.cuh"

 #ifdef USE_ROCM
   #include "quantization/fp8/amd/quant_utils.cuh"

@@ -261,14 +262,26 @@ __global__ void reshape_and_cache_kernel(

   }
 }

+// Used by vectorization_utils to copy/convert one element
+template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
+struct CopyWithScaleOp {
+  float scale;
+
+  __device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst = static_cast<OutT>(src);
+    } else {
+      dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
+    }
+  }
+};
+
 template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_flash_kernel(
     const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
     const scalar_t* __restrict__ value,  // [num_tokens, num_heads, head_size]
-    cache_t* __restrict__ key_cache,     // [num_blocks, block_size, num_heads,
-                                         // head_size]
-    cache_t* __restrict__ value_cache,   // [num_blocks, block_size, num_heads,
-                                         // head_size]
+    cache_t* __restrict__ key_cache,    // NHD or HND, shape see comments below
+    cache_t* __restrict__ value_cache,  // same above
     const int64_t* __restrict__ slot_mapping,  // [num_tokens]
     const int64_t block_stride, const int64_t page_stride,
     const int64_t head_stride, const int64_t key_stride,

@@ -282,25 +295,58 @@ __global__ void reshape_and_cache_flash_kernel(

   }
   const int64_t block_idx = slot_idx / block_size;
   const int64_t block_offset = slot_idx % block_size;
-  const int n = num_heads * head_size;
-  for (int i = threadIdx.x; i < n; i += blockDim.x) {
-    const int64_t src_key_idx = token_idx * key_stride + i;
-    const int64_t src_value_idx = token_idx * value_stride + i;
-    const int head_idx = i / head_size;
-    const int head_offset = i % head_size;
-    const int64_t tgt_key_value_idx = block_idx * block_stride +
-                                      block_offset * page_stride +
-                                      head_idx * head_stride + head_offset;
-    scalar_t tgt_key = key[src_key_idx];
-    scalar_t tgt_value = value[src_value_idx];
-    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      key_cache[tgt_key_value_idx] = tgt_key;
-      value_cache[tgt_key_value_idx] = tgt_value;
-    } else {
-      key_cache[tgt_key_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
-      value_cache[tgt_key_value_idx] =
-          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
+  const int n_elems = num_heads * head_size;
+
+  // pointers to the beginning of the source row for this token.
+  const scalar_t* __restrict__ key_src = key + token_idx * key_stride;
+  const scalar_t* __restrict__ value_src = value + token_idx * value_stride;
+
+  // find the start position inside the kv-cache for this token.
+  cache_t* __restrict__ key_dst =
+      key_cache + block_idx * block_stride + block_offset * page_stride;
+  cache_t* __restrict__ value_dst =
+      value_cache + block_idx * block_stride + block_offset * page_stride;
+
+  // this is true for the NHD layout where `head_stride == head_size`
+  const bool is_contiguous_heads = (head_stride == head_size);
+
+  float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
+  float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
+  constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
+  CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
+  if (is_contiguous_heads) {
+    // NHD layout
+    // kv cache: [num_blocks, block_size, num_heads, head_size]
+    vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
+                                       blockDim.x, k_op);
+
+    vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
+                                       threadIdx.x, blockDim.x, v_op);
+
+  } else {
+    // HND layout: heads are strided, but each head_size segment is contiguous
+    // kv cache: [num_blocks, num_heads, block_size, head_size]
+    const int lane = threadIdx.x & 31;      // 0..31 within warp
+    const int warp_id = threadIdx.x >> 5;   // warp index within block
+    const int warps_per_block = blockDim.x >> 5;
+
+    for (int head = warp_id; head < num_heads; head += warps_per_block) {
+      const scalar_t* __restrict__ k_src_h = key_src + head * head_size;
+      const scalar_t* __restrict__ v_src_h = value_src + head * head_size;
+
+      cache_t* __restrict__ k_dst_h =
+          key_dst + static_cast<int64_t>(head) * head_stride;
+      cache_t* __restrict__ v_dst_h =
+          value_dst + static_cast<int64_t>(head) * head_stride;
+
+      // within each head, let the 32 threads of the warp perform the vector
+      // copy
+      vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
+                                         k_op);
+
+      vectorize_with_alignment<VEC_SIZE>(v_src_h, v_dst_h, head_size, lane, 32,
+                                         v_op);
     }
   }
 }
@@ -16,12 +16,14 @@ struct KernelVecType<float> {

   using cvt_vec_type = vec_op::FP32Vec16;
 };

+#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
 template <>
 struct KernelVecType<c10::BFloat16> {
   using load_vec_type = vec_op::BF16Vec16;
   using azp_adj_load_vec_type = vec_op::INT32Vec16;
   using cvt_vec_type = vec_op::FP32Vec16;
 };
+#endif

 template <>
 struct KernelVecType<c10::Half> {
@@ -151,7 +151,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

   ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);

   // Quantization
-#if defined(__AVX512F__) || defined(__aarch64__)
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
   at::Tag stride_tag = at::Tag::needs_fixed_stride_order;

   // Compute int8 quantized tensor for given scaling factor.
@@ -4,8 +4,35 @@

 #include <hip/hip_runtime.h>
 #endif

-#if defined(USE_ROCM) && defined(__GFX9__)
-  #define WARP_SIZE 64
+#ifdef USE_ROCM
+struct Utils {
+  static __host__ int get_warp_size() {
+    static bool is_cached = false;
+    static int result;
+
+    if (!is_cached) {
+      int device_id;
+      cudaDeviceProp deviceProp;
+      cudaGetDevice(&device_id);
+      cudaGetDeviceProperties(&deviceProp, device_id);
+
+      result = deviceProp.warpSize;
+      is_cached = true;
+    }
+
+    return result;
+  }
+
+  static __device__ constexpr int get_warp_size() {
+#ifdef __GFX9__
+    return 64;
+#else
+    return 32;
+#endif
+  }
+};
+
+#define WARP_SIZE Utils::get_warp_size()
 #else
   #define WARP_SIZE 32
 #endif
@@ -10,32 +10,28 @@

 void moe_permute(
     const torch::Tensor& input,         // [n_token, hidden]
-    const torch::Tensor& topk_weights,  //[n_token, topk]
-    torch::Tensor& topk_ids,            // [n_token, topk]
+    const torch::Tensor& topk_ids,      // [n_token, topk]
     const torch::Tensor& token_expert_indices,       // [n_token, topk]
     const std::optional<torch::Tensor>& expert_map,  // [n_expert]
     int64_t n_expert, int64_t n_local_expert, int64_t topk,
     const std::optional<int64_t>& align_block_size,
-    torch::Tensor&
-        permuted_input,  // [topk * n_token/align_block_size_m, hidden]
+    torch::Tensor& permuted_input,             // [permuted_size, hidden]
     torch::Tensor& expert_first_token_offset,  // [n_local_expert + 1]
-    torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
+    torch::Tensor& inv_permuted_idx,           // [n_token, topk]
+    torch::Tensor& permuted_idx,               // [permute_size]
     torch::Tensor& m_indices) {                // [align_expand_m]
-  TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float,
-              "topk_weights must be float32");
   TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long,
               "expert_first_token_offset must be int64");
   TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
               "topk_ids must be int32");
   TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
               "token_expert_indices must be int32");
-  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
-              "src_row_id2dst_row_id_map must be int32");
+  TORCH_CHECK(inv_permuted_idx.scalar_type() == at::ScalarType::Int,
+              "inv_permuted_idx must be int32");
   TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
               "expert_first_token_offset shape != n_local_expert+1")
-  TORCH_CHECK(
-      src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
-      "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
+  TORCH_CHECK(inv_permuted_idx.sizes() == token_expert_indices.sizes(),
+              "token_expert_indices shape must be same as inv_permuted_idx");
   auto n_token = input.sizes()[0];
   auto n_hidden = input.sizes()[1];
   auto align_block_size_value =

@@ -46,8 +42,9 @@ void moe_permute(

   auto sort_workspace = torch::empty(
       {sorter_size},
       torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false));
+  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
   auto permuted_experts_id = torch::empty_like(topk_ids);
-  auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map);
+  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
   auto align_expert_first_token_offset =
       torch::zeros_like(expert_first_token_offset);

@@ -67,24 +64,22 @@ void moe_permute(

     const int* expert_map_ptr = get_ptr<int>(expert_map.value());
     valid_num_ptr =
         get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
-    preprocessTopkIdLauncher(get_ptr<int>(topk_ids), n_token * topk,
+    preprocessTopkIdLauncher(get_ptr<int>(copy_topk_ids), n_token * topk,
                              expert_map_ptr, n_expert, stream);
   }
   // expert sort topk expert id and scan expert id get expert_first_token_offset
-  sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
-                    get_ptr<int>(permuted_experts_id),
-                    get_ptr<int>(dst_row_id2src_row_id_map),
-                    get_ptr<int64_t>(expert_first_token_offset), n_token,
-                    n_expert, n_local_expert, topk, sorter,
-                    get_ptr<int>(sort_workspace), stream);
+  sortAndScanExpert(
+      get_ptr<int>(copy_topk_ids), get_ptr<int>(token_expert_indices),
+      get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
+      get_ptr<int64_t>(expert_first_token_offset), n_token, n_expert,
+      n_local_expert, topk, sorter, get_ptr<int>(sort_workspace), stream);

   // dispatch expandInputRowsKernelLauncher
   MOE_DISPATCH(input.scalar_type(), [&] {
     expandInputRowsKernelLauncher<scalar_t>(
         get_ptr<scalar_t>(input), get_ptr<scalar_t>(permuted_input),
-        get_ptr<float>(topk_weights), get_ptr<int>(permuted_experts_id),
-        get_ptr<int>(dst_row_id2src_row_id_map),
-        get_ptr<int>(src_row_id2dst_row_id_map),
+        get_ptr<int>(permuted_experts_id), get_ptr<int>(sorted_row_idx),
+        get_ptr<int>(inv_permuted_idx), get_ptr<int>(permuted_idx),
         get_ptr<int64_t>(expert_first_token_offset), n_token, valid_num_ptr,
         n_hidden, topk, n_local_expert, align_block_size_value, stream);
   });

@@ -101,32 +96,34 @@ void moe_permute(

 }

 void moe_unpermute(
     const torch::Tensor& permuted_hidden_states,  // [n_token * topk, hidden]
-    const torch::Tensor& topk_weights,  //[n_token, topk]
-    const torch::Tensor& topk_ids,      // [n_token, topk]
-    const torch::Tensor& src_row_id2dst_row_id_map,  // [n_token, topk]
-    const torch::Tensor& expert_first_token_offset,  // [n_local_expert+1]
-    int64_t n_expert, int64_t n_local_expert, int64_t topk,
+    const torch::Tensor& topk_weights,      // [n_token, topk]
+    const torch::Tensor& inv_permuted_idx,  // [n_token, topk]
+    const std::optional<torch::Tensor>&
+        expert_first_token_offset,  // [n_local_expert+1]
+    int64_t topk,
     torch::Tensor& hidden_states  // [n_token, hidden]
 ) {
-  TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(),
-              "topk_ids shape must be same as src_row_id2dst_row_id_map");
-  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
-              "topk_ids must be int32");
   TORCH_CHECK(
       permuted_hidden_states.scalar_type() == hidden_states.scalar_type(),
-      "topk_ids dtype must be same as src_row_id2dst_row_id_map");
+      "permuted_hidden_states dtype must be same as hidden_states");
   auto n_token = hidden_states.size(0);
   auto n_hidden = hidden_states.size(1);
   auto stream = at::cuda::getCurrentCUDAStream().stream();
-  const int64_t* valid_ptr =
-      get_ptr<int64_t>(expert_first_token_offset) + n_local_expert;
+  int64_t const* valid_ptr = nullptr;
+  if (expert_first_token_offset.has_value()) {
+    int n_local_expert = expert_first_token_offset.value().size(0) - 1;
+    valid_ptr =
+        get_ptr<int64_t>(expert_first_token_offset.value()) + n_local_expert;
+  }
+
   MOE_DISPATCH(hidden_states.scalar_type(), [&] {
     finalizeMoeRoutingKernelLauncher<scalar_t, scalar_t>(
         get_ptr<scalar_t>(permuted_hidden_states),
         get_ptr<scalar_t>(hidden_states), get_ptr<float>(topk_weights),
-        get_ptr<int>(src_row_id2dst_row_id_map), get_ptr<int>(topk_ids),
-        n_token, n_hidden, topk, valid_ptr, stream);
+        get_ptr<int>(inv_permuted_idx), n_token, n_hidden, topk, valid_ptr,
+        stream);
   });
 }
@@ -177,7 +177,7 @@ __global__ void getMIndicesKernel(int64_t* expert_first_token_offset,

   int tidx = threadIdx.x;
   extern __shared__ int64_t smem_expert_first_token_offset[];
   for (int i = tidx; i <= num_local_expert; i += blockDim.x) {
-    smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i);
+    smem_expert_first_token_offset[i] = __ldg(expert_first_token_offset + i);
   }
   __syncthreads();
   auto last_token_offset = smem_expert_first_token_offset[eidx + 1];
@@ -57,31 +57,19 @@ void sortAndScanExpert(int* expert_for_source_row, const int* source_rows,

 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output,
-    const float* unpermuted_scales, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
     int64_t* expert_first_token_offset, int64_t const num_rows,
    int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
    int num_local_experts, const int& align_block_size, cudaStream_t stream);

-// Final kernel to unpermute and scale
-// This kernel unpermutes the original data, does the k-way reduction and
-// performs the final skip connection.
-template <typename T, typename OutputType, bool CHECK_SKIPPED>
-__global__ void finalizeMoeRoutingKernel(
-    T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
-    float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
-    int64_t const* num_valid_ptr);
-
 template <class T, class OutputType>
 void finalizeMoeRoutingKernelLauncher(
     T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
     float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const num_rows,
-    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
-    cudaStream_t stream);
+    int64_t const num_rows, int64_t const cols, int64_t const k,
+    int64_t const* num_valid_ptr, cudaStream_t stream);

 void preprocessTopkIdLauncher(int* topk_id_ptr, int size,
                               const int* expert_map_ptr, int num_experts,
@@ -2,10 +2,9 @@

 template <typename T, bool CHECK_SKIPPED, bool ALIGN_BLOCK_SIZE>
 __global__ void expandInputRowsKernel(
-    T const* unpermuted_input, T* permuted_output,
-    const float* unpermuted_scales, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_dest_rows, int64_t const cols, int64_t k,
     int num_local_experts, int align_block_size) {

@@ -54,6 +53,10 @@ __global__ void expandInputRowsKernel(

     assert(expanded_dest_row <= INT32_MAX);
     expanded_source_row_to_expanded_dest_row[expanded_source_row] =
         static_cast<int>(expanded_dest_row);
+    // skip non local expert token
+    if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {
+      permuted_idx[expanded_dest_row] = expanded_source_row;
+    }
   }

   if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) {

@@ -62,7 +65,7 @@ __global__ void expandInputRowsKernel(

     using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;

     // Duplicate and permute rows
-    int64_t const source_row = expanded_source_row % num_rows;
+    int64_t const source_row = expanded_source_row / k;

     auto const* source_row_ptr =
         reinterpret_cast<DataElem const*>(unpermuted_input + source_row * cols);

@@ -82,10 +85,9 @@ __global__ void expandInputRowsKernel(

 template <typename T>
 void expandInputRowsKernelLauncher(
-    T const* unpermuted_input, T* permuted_output,
-    const float* unpermuted_scales, int* sorted_experts,
+    T const* unpermuted_input, T* permuted_output, int* sorted_experts,
     int const* expanded_dest_row_to_expanded_source_row,
-    int* expanded_source_row_to_expanded_dest_row,
+    int* expanded_source_row_to_expanded_dest_row, int* permuted_idx,
     int64_t* expert_first_token_offset, int64_t const num_rows,
     int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
     int num_local_experts, const int& align_block_size, cudaStream_t stream) {

@@ -105,11 +107,11 @@ void expandInputRowsKernelLauncher(

   int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1);

   func<<<blocks, threads, smem_size, stream>>>(
-      unpermuted_input, permuted_output, unpermuted_scales, sorted_experts,
+      unpermuted_input, permuted_output, sorted_experts,
       expanded_dest_row_to_expanded_source_row,
-      expanded_source_row_to_expanded_dest_row, expert_first_token_offset,
-      num_rows, num_valid_tokens_ptr, cols, k, num_local_experts,
-      align_block_size);
+      expanded_source_row_to_expanded_dest_row, permuted_idx,
+      expert_first_token_offset, num_rows, num_valid_tokens_ptr, cols, k,
+      num_local_experts, align_block_size);
 }

 template <class T, class U>

@@ -128,11 +130,9 @@ template <typename T, typename OutputType, bool CHECK_SKIPPED>

 __global__ void finalizeMoeRoutingKernel(
     T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
     float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const orig_cols, int64_t const k,
-    int64_t const* num_valid_ptr) {
+    int64_t const orig_cols, int64_t const k, int64_t const* num_valid_ptr) {
   assert(orig_cols % 4 == 0);
   int64_t const original_row = blockIdx.x;
-  int64_t const num_rows = gridDim.x;
   auto const offset = original_row * orig_cols;
   OutputType* reduced_row_ptr = reduced_unpermuted_output + offset;
   int64_t const num_valid = *num_valid_ptr;

@@ -159,14 +159,13 @@ __global__ void finalizeMoeRoutingKernel(

   ComputeElem thread_output;
   thread_output.fill(0);
   for (int k_idx = 0; k_idx < k; ++k_idx) {
-    int64_t const expanded_original_row = original_row + k_idx * num_rows;
+    int64_t const expanded_original_row = original_row * k + k_idx;
     int64_t const expanded_permuted_row =
         expanded_source_row_to_expanded_dest_row[expanded_original_row];

     int64_t const k_offset = original_row * k + k_idx;
     float const row_scale = scales[k_offset];

-    // Check after row_rescale has accumulated
     if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) {
       continue;
     }

@@ -189,9 +188,8 @@ template <class T, class OutputType>

 void finalizeMoeRoutingKernelLauncher(
     T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
     float const* scales, int const* expanded_source_row_to_expanded_dest_row,
-    int const* expert_for_source_row, int64_t const num_rows,
-    int64_t const cols, int64_t const k, int64_t const* num_valid_ptr,
-    cudaStream_t stream) {
+    int64_t const num_rows, int64_t const cols, int64_t const k,
+    int64_t const* num_valid_ptr, cudaStream_t stream) {
   int64_t const blocks = num_rows;
   int64_t const threads = 256;
   bool const check_finished = num_valid_ptr != nullptr;

@@ -201,6 +199,5 @@ void finalizeMoeRoutingKernelLauncher(

   auto* const kernel = func_map[check_finished];
   kernel<<<blocks, threads, 0, stream>>>(
       expanded_permuted_rows, reduced_unpermuted_output, scales,
-      expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k,
-      num_valid_ptr);
+      expanded_source_row_to_expanded_dest_row, cols, k, num_valid_ptr);
 }
@@ -24,9 +24,12 @@

 #ifndef USE_ROCM
   #include <cub/util_type.cuh>
   #include <cub/cub.cuh>
+  #include <cuda/std/functional>
+using AddOp = cuda::std::plus<float>;
 #else
   #include <hipcub/util_type.hpp>
   #include <hipcub/hipcub.hpp>
+using AddOp = cub::Sum;
 #endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))

@@ -62,7 +65,6 @@ __launch_bounds__(TPB) __global__

     const int thread_row_offset = blockIdx.x * num_cols;

-    cub::Sum sum;
     float threadData(-FLT_MAX);

     // Don't touch finished rows.

@@ -92,7 +94,7 @@ __launch_bounds__(TPB) __global__

         threadData += exp((static_cast<float>(input[idx]) - float_max));
     }

-    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
+    const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());

     if (threadIdx.x == 0)
     {

@@ -190,8 +192,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(

   2) This implementation assumes k is small, but will work for any k.
 */

-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, typename IndType>
-__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
+__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
     void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
         int* source_rows, const int k, const int start_expert, const int end_expert)
 {

@@ -209,12 +211,12 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__

     // Restrictions based on previous section.
     static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
-    static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
+    static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
     static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
-    static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");
+    static_assert(THREADS_PER_ROW <= WARP_SIZE_PARAM, "THREADS_PER_ROW can be at most warp size");

     // We have NUM_EXPERTS elements per row. We specialize for small #experts
-    static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
+    static constexpr int ELTS_PER_WARP = WARP_SIZE_PARAM * VPT;
     static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
     static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;

@@ -393,41 +395,51 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__

 namespace detail
 {
 // Constructs some constants needed to partition the work across threads at compile time.
-template <int EXPERTS, int BYTES_PER_LDG>
+template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
 struct TopkConstants
 {
     static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
-    static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
-    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
+    static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
+    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
     static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
     static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
-    static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
+    static const int ROWS_PER_WARP = WARP_SIZE_PARAM / THREADS_PER_ROW;
 };
 } // namespace detail

-template <int EXPERTS, int WARPS_PER_TB, typename IndType>
+template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, typename IndType>
 void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
     int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
 {
     static constexpr std::size_t MAX_BYTES_PER_LDG = 16;

     static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
-    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
+    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
     static constexpr int VPT = Constants::VPT;
     static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
     const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
     const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;

-    dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
-    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG><<<num_blocks, block_dim, 0, stream>>>(
+    dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
+    topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
         input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
 }

-#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                               \
-    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>(                 \
-        gating_output, nullptr, topk_weights, topk_indices,                     \
-        token_expert_indices, num_tokens, topk, 0, num_experts,                 \
-        stream);
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                                 \
+    switch (warpSize) {                                                           \
+        case 32:                                                                  \
+            topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32>(       \
+                gating_output, nullptr, topk_weights, topk_indices,               \
+                token_expert_indices, num_tokens, topk, 0, num_experts, stream);  \
+            break;                                                                \
+        case 64:                                                                  \
+            topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64>(       \
+                gating_output, nullptr, topk_weights, topk_indices,               \
+                token_expert_indices, num_tokens, topk, 0, num_experts, stream);  \
+            break;                                                                \
+        default:                                                                  \
+            TORCH_CHECK(false, "Unsupported warp size: ", warpSize);              \
+    }

 template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(

@@ -441,6 +453,7 @@ void topkGatingSoftmaxKernelLauncher(

     const int topk,
     cudaStream_t stream) {
   static constexpr int WARPS_PER_TB = 4;
+  auto warpSize = WARP_SIZE;
   switch (num_experts) {
     case 1:
       LAUNCH_SOFTMAX(1, WARPS_PER_TB);
@@ -56,18 +56,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

       " -> Tensor");

   m.def(
-      "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
+      "moe_permute(Tensor input, Tensor topk_ids,"
       "Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
       "int n_local_expert,"
       "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
-      "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
-      "m_indices)->()");
+      "expert_first_token_offset, Tensor! inv_permuted_idx, Tensor! "
+      "permuted_idx, Tensor! m_indices)->()");

   m.def(
       "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights,"
-      "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
-      "expert_first_token_offset, int n_expert, int n_local_expert,int "
-      "topk, Tensor! hidden_states)->()");
+      "Tensor inv_permuted_idx, Tensor? expert_first_token_offset, "
+      "int topk, Tensor! hidden_states)->()");

   m.def("moe_permute_unpermute_supported() -> bool");
   m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
@@ -292,6 +292,11 @@ void per_token_group_quant_fp8(const torch::Tensor& input,

                                torch::Tensor& output_q, torch::Tensor& output_s,
                                int64_t group_size, double eps, double fp8_min,
                                double fp8_max, bool scale_ue8m0);

+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max);
 #endif

 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
@@ -4,7 +4,7 @@

 #include <cmath>
 #include "core/math.hpp"
-#include "cuda_compat.h"
+#include "../cuda_compat.h"
 #include "dispatch_utils.h"

 #include "quantization/fp8/common.cuh"
@ -1,6 +1,10 @@
|
|||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
#include <torch/all.h>
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#include "../per_token_group_quant_8bit.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "../../dispatch_utils.h"
|
#include "../../dispatch_utils.h"
|
||||||
@@ -336,3 +340,13 @@ void dynamic_scaled_int8_quant(
       }
     });
 }
+
+#ifndef USE_ROCM
+void per_token_group_quant_int8(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double int8_min, double int8_max) {
+  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
+                             int8_min, int8_max);
+}
+#endif
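The new `per_token_group_quant_int8` entry point above forwards to the shared 8-bit group-quant kernel. As a rough CPU reference of what per-token-group int8 quantization computes (a sketch only; `per_token_group_quant_int8_ref` is a hypothetical helper, and the real rounding/layout details live in `per_token_group_quant_8bit`):

```cpp
// Sketch: quantize each group of `group_size` elements of a row with its own
// scale derived from the group's absolute maximum (clamped below by eps).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void per_token_group_quant_int8_ref(const std::vector<float>& row,
                                    int64_t group_size, double eps,
                                    std::vector<int8_t>& q,
                                    std::vector<float>& scales) {
  const double int8_max = 127.0, int8_min = -128.0;
  q.resize(row.size());
  scales.clear();
  for (size_t g = 0; g < row.size(); g += static_cast<size_t>(group_size)) {
    size_t end = std::min(row.size(), g + static_cast<size_t>(group_size));
    double absmax = eps;  // eps keeps the scale away from zero
    for (size_t i = g; i < end; ++i)
      absmax = std::max(absmax, std::fabs(static_cast<double>(row[i])));
    double scale = absmax / int8_max;
    scales.push_back(static_cast<float>(scale));
    for (size_t i = g; i < end; ++i)
      q[i] = static_cast<int8_t>(
          std::clamp(std::round(row[i] / scale), int8_min, int8_max));
  }
}
```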
@@ -86,6 +86,7 @@ D = s_a s_b \widehat A \widehat B
 ```
 
 Epilogue parameters:
 
 - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
 - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 
@@ -135,7 +136,7 @@ That is precomputed and stored in `azp_with_adj` as a row-vector.
 Epilogue parameters:
 
 - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
   - Generally this will be per-tensor as the zero-points are per-tensor.
 - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 - `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
 - `bias` is the bias, is always per-channel (row-vector).
 
@@ -152,7 +153,7 @@ That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product.
 Epilogue parameters:
 
 - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
   - Generally this will be per-token as the zero-points are per-token.
 - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
 - `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
 - `azp` is the zero-point (`z_a`), is per-token (column-vector).
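A compact worked form of the zero-point algebra those parameters implement, using the same notation (a sketch of the identity, not additional epilogue parameters):

```
\widehat A = A - z_a J_a
\;\Rightarrow\;
D = s_a s_b \widehat A \widehat B + \mathrm{bias}
  = s_a s_b \bigl( A \widehat B - z_a J_a \widehat B \bigr) + \mathrm{bias}
```

For per-tensor zero-points the subtracted term is the precomputed row-vector `azp_with_adj`; for per-token zero-points it becomes the outer product of the per-token column `azp` with the per-channel row `azp_adj` ($` \mathbf 1 \widehat B `$), which is why the two variants take different arguments.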
@@ -1,6 +1,5 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm90_fp8_dispatch.cuh"
-#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 namespace vllm {
 
@@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
   if (bias) {
     TORCH_CHECK(bias->dtype() == out.dtype(),
                 "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
+    return cutlass_scaled_mm_sm90_fp8_epilogue<true>(out, a, b, a_scales,
+                                                     b_scales, *bias);
   } else {
-    return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_sm90_fp8_epilogue<false>(out, a, b, a_scales,
+                                                      b_scales);
   }
 }
 
@@ -2,6 +2,7 @@
 
 #include "scaled_mm.cuh"
 #include "cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 /**
  * This file defines Gemm kernel configurations for SM90 (fp8) based on the Gemm
@@ -12,8 +13,91 @@ namespace vllm {
 
 using c3x::cutlass_gemm_caller;
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename ElementAB_, typename ElementD_,
+          template <typename, typename, typename> typename Epilogue_,
+          typename TileShape, typename ClusterShape, typename KernelSchedule,
+          typename EpilogueSchedule, bool swap_ab_ = false>
+struct cutlass_3x_gemm_sm90_fp8 {
+  using ElementAB = ElementAB_;
+  using ElementC = ElementD_;
+  using ElementD = ElementD_;
+  using ElementAcc =
+      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
+                                float>::type;
+
+  using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
+
+  using EVTCompute = typename Epilogue::EVTCompute;
+
+  static constexpr int AlignmentAB =
+      128 / cutlass::sizeof_bits<ElementAB>::value;
+  static constexpr int AlignmentCD =
+      128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Compile-time swap_ab flag
+  static constexpr bool swap_ab = swap_ab_;
+
+  // -----------------------------------------------------------
+  // Layout definitions
+  // -----------------------------------------------------------
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutA_T = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutB_T = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutD_Transpose =
+      typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+  using LayoutC = LayoutD;
+  using LayoutC_Transpose = LayoutD_Transpose;
+
+  // -----------------------------------------------------------
+  // Collective epilogue (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
+          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+          ElementAcc, float, ElementC,
+          conditional_t<swap_ab, LayoutC_Transpose, LayoutC>, AlignmentCD,
+          ElementD, conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
+          AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp;
+
+  static constexpr size_t CEStorageSize =
+      sizeof(typename CollectiveEpilogue::SharedStorage);
+
+  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(CEStorageSize)>;
+
+  // -----------------------------------------------------------
+  // Collective mainloop (conditionally swap operands and layouts)
+  // -----------------------------------------------------------
+  using CollectiveMainloop = conditional_t<
+      swap_ab,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutB_T, AlignmentAB,             // Swapped B (as A)
+          ElementAB, LayoutA_T, AlignmentAB,  // Swapped A (as B)
+          ElementAcc, TileShape, ClusterShape, Stages,
+          KernelSchedule>::CollectiveOp,
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, ElementAB,
+          LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc,
+          TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>;
+
+  // -----------------------------------------------------------
+  // Kernel definition
+  // -----------------------------------------------------------
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>>;
+
+  struct GemmKernel : public KernelType {};
+};
+
+template <typename InType, typename OutType, bool EnableBias>
 struct sm90_fp8_config_default {
   // M in (128, inf)
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
@@ -22,13 +106,17 @@ struct sm90_fp8_config_default {
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
   using TileShape = Shape<_128, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
+template <typename InType, typename OutType, bool EnableBias>
 struct sm90_fp8_config_M128 {
   // M in (64, 128]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
@@ -37,33 +125,146 @@ struct sm90_fp8_config_M128 {
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
   using TileShape = Shape<_64, _128, _128>;
   using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M64 {
-  // M in [1, 64]
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M64_N1280 {
+  // M in (16, 64], N in [1 1280]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
   using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _8, _1>;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _4, _1>;
 
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
 };
 
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M64_N8192 {
+  // M in (16, 64], N > 1280
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M16_N1280 {
+  // M in [1, 16], N in [1, 1280]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _2, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename InType, typename OutType, bool EnableBias>
+struct sm90_fp8_config_M16_N8192 {
+  // M in [1, 16], N > 1280
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _16, _256>;
+  using ClusterShape = Shape<_1, _1, _1>;
+
+  // enable swap AB for M < 64
+  using Cutlass3xGemm = conditional_t<
+      EnableBias,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogueColumnBias,
+                               TileShape, ClusterShape, KernelSchedule,
+                               EpilogueSchedule, true>,
+      cutlass_3x_gemm_sm90_fp8<InType, OutType, c3x::ScaledEpilogue, TileShape,
+                               ClusterShape, KernelSchedule, EpilogueSchedule,
+                               true>>;
+};
+
+template <typename Gemm, typename... EpilogueArgs>
+void cutlass_gemm_caller_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
+                                  torch::Tensor const& b,
+                                  EpilogueArgs&&... epilogue_params) {
+  static constexpr bool swap_ab = Gemm::swap_ab;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+
+  int32_t m = a.size(0), n = b.size(1), k = a.size(1);
+  auto prob_shape =
+      swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
+
+  StrideA a_stride =
+      cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride =
+      cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(
+      StrideC{},
+      swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  typename GemmKernel::MainloopArguments mainloop_args =
+      swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr,
+                                                       a_stride}
+              : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr,
+                                                       b_stride};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      Gemm::Epilogue::prepare_args(
+          std::forward<EpilogueArgs>(epilogue_params)...),
+      c_ptr, c_stride, c_ptr, c_stride};
+
+  c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
+                                       epilogue_args);
+}
+
+template <typename InType, typename OutType, bool EnableBias,
           typename... EpilogueArgs>
 inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
                                            torch::Tensor const& a,
                                            torch::Tensor const& b,
+                                           torch::Tensor const& a_scales,
+                                           torch::Tensor const& b_scales,
                                            EpilogueArgs&&... args) {
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
@@ -71,50 +272,75 @@ inline void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out,
 
   using Cutlass3xGemmDefault =
       typename sm90_fp8_config_default<InType, OutType,
-                                       Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
+                                       EnableBias>::Cutlass3xGemm;
   using Cutlass3xGemmM128 =
-      typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
+      typename sm90_fp8_config_M128<InType, OutType, EnableBias>::Cutlass3xGemm;
+
+  using Cutlass3xGemmM64_N1280 =
+      typename sm90_fp8_config_M64_N1280<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM64_N8192 =
+      typename sm90_fp8_config_M64_N8192<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16_N1280 =
+      typename sm90_fp8_config_M16_N1280<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
+  using Cutlass3xGemmM16_N8192 =
+      typename sm90_fp8_config_M16_N8192<InType, OutType,
+                                         EnableBias>::Cutlass3xGemm;
 
   uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+  uint32_t const n = b.size(1);
 
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
+  if (m <= 16) {
+    // m in [1, 16]
+    if (n <= 1280) {
+      return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N1280>(
+          out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM16_N8192>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 64) {
+    // m in (16, 64]
+    if (n <= 1280) {
+      return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N1280>(
+          out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+    }
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM64_N8192>(
+        out, a, b, b_scales, a_scales, std::forward<EpilogueArgs>(args)...);
+  } else if (m <= 128) {
     // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmM128>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
   } else {
     // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
+    return cutlass_gemm_caller_sm90_fp8<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales, std::forward<EpilogueArgs>(args)...);
   }
 }
 
-template <template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
+template <bool EnableBias, typename... EpilogueArgs>
 void cutlass_scaled_mm_sm90_fp8_epilogue(torch::Tensor& out,
                                          torch::Tensor const& a,
                                          torch::Tensor const& b,
+                                         torch::Tensor const& a_scales,
+                                         torch::Tensor const& b_scales,
                                          EpilogueArgs&&... epilogue_args) {
   TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
   TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
   if (out.dtype() == torch::kBFloat16) {
     return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                          cutlass::bfloat16_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+                                          cutlass::bfloat16_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
     return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                          cutlass::half_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
+                                          cutlass::half_t, EnableBias>(
+        out, a, b, a_scales, b_scales,
+        std::forward<EpilogueArgs>(epilogue_args)...);
   }
 }
 
 }  // namespace vllm
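The `swap_ab` configurations above build the mainloop with the operands exchanged and the output layouts transposed, so that when M is very small the hardware-efficient tile dimension lands on N instead. Outside of CUTLASS the identity this relies on is simply C = (Bᵀ Aᵀ)ᵀ; a minimal host-side sketch of that equivalence (illustrative only; `matmul` and `transpose` are ad-hoc helpers, not functions from this diff):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Naive row-major GEMM: (m x k) * (k x n) -> (m x n).
static std::vector<float> matmul(const std::vector<float>& a,
                                 const std::vector<float>& b, int m, int n,
                                 int k) {
  std::vector<float> c(m * n, 0.0f);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p) c[i * n + j] += a[i * k + p] * b[p * n + j];
  return c;
}

// Transpose a row-major (rows x cols) matrix.
static std::vector<float> transpose(const std::vector<float>& x, int rows,
                                    int cols) {
  std::vector<float> t(rows * cols);
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j) t[j * rows + i] = x[i * cols + j];
  return t;
}

int main() {
  const int m = 2, n = 3, k = 4;  // small "skinny M" example
  std::vector<float> a = {1, 2, 3, 4, 5, 6, 7, 8};              // m x k
  std::vector<float> b = {1, 0, 2, 0, 1, 0, 3, 1, 0, 0, 0, 1};  // k x n

  auto c = matmul(a, b, m, n, k);                        // direct A*B
  auto c_swapped = matmul(transpose(b, k, n),            // B^T (n x k)
                          transpose(a, m, k), n, m, k);  // A^T (k x m)
  auto c_back = transpose(c_swapped, n, m);              // (B^T A^T)^T

  for (int i = 0; i < m * n; ++i) assert(c[i] == c_back[i]);
  std::printf("swap-AB check passed\n");
  return 0;
}
```

The dispatch code consequently passes `b_scales, a_scales` (in that order) to the swapped configurations, since the roles of the per-token and per-channel scales trade places along with the operands.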
@@ -47,13 +47,12 @@ __global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
 
 __global__ void compute_expert_offsets(
     const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
-    int32_t* atomic_buffer, const int num_experts, const int topk_length) {
+    int32_t* atomic_buffer, const int num_experts, const bool swap_ab) {
   int32_t tot_offset = 0;
   expert_offsets[0] = 0;
   for (int i = 0; i < num_experts; ++i) {
     atomic_buffer[i] = tot_offset;
-    tot_offset += topk_length > SWAP_AB_THRESHOLD ? problem_sizes1[i * 3]
-                                                  : problem_sizes1[i * 3 + 1];
+    tot_offset += swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
     expert_offsets[i + 1] = tot_offset;
   }
 }
@@ -61,15 +60,14 @@ __global__ void compute_expert_offsets(
 __global__ void compute_expert_blockscale_offsets(
     const int32_t* __restrict__ problem_sizes1, int32_t* expert_offsets,
     int32_t* blockscale_offsets, int32_t* atomic_buffer, const int num_experts,
-    const int topk_length) {
+    const bool swap_ab) {
   int32_t tot_offset = 0;
   int32_t tot_offset_round = 0;
   expert_offsets[0] = 0;
   blockscale_offsets[0] = 0;
   for (int i = 0; i < num_experts; ++i) {
-    int32_t cur_offset = topk_length > SWAP_AB_THRESHOLD
-                             ? problem_sizes1[i * 3]
-                             : problem_sizes1[i * 3 + 1];
+    int32_t cur_offset =
+        swap_ab ? problem_sizes1[i * 3 + 1] : problem_sizes1[i * 3];
     atomic_buffer[i] = tot_offset;
     tot_offset += cur_offset;
     expert_offsets[i + 1] = tot_offset;
@@ -119,15 +117,19 @@ void get_cutlass_moe_mm_data_caller(
 
   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
 
-  if (topk_ids.numel() > SWAP_AB_THRESHOLD) {
-    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
+  // Swap-AB should be disabled for FP4 path
+  bool may_swap_ab = (!blockscale_offsets.has_value()) &&
+                     (topk_ids.numel() <= SWAP_AB_THRESHOLD);
+
+  if (may_swap_ab) {
+    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
         static_cast<const int32_t*>(topk_ids.data_ptr()),
         static_cast<int32_t*>(problem_sizes1.data_ptr()),
         static_cast<int32_t*>(problem_sizes2.data_ptr()),
         static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n,
         k);
   } else {
-    compute_problem_sizes<true><<<num_experts, num_threads, 0, stream>>>(
+    compute_problem_sizes<false><<<num_experts, num_threads, 0, stream>>>(
         static_cast<const int32_t*>(topk_ids.data_ptr()),
         static_cast<int32_t*>(problem_sizes1.data_ptr()),
         static_cast<int32_t*>(problem_sizes2.data_ptr()),
@@ -136,18 +138,19 @@ void get_cutlass_moe_mm_data_caller(
   }
 
   if (blockscale_offsets.has_value()) {
+    // fp4 path
     compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
         static_cast<const int32_t*>(problem_sizes1.data_ptr()),
         static_cast<int32_t*>(expert_offsets.data_ptr()),
         static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
         static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
-        topk_ids.numel());
+        may_swap_ab);
   } else {
     compute_expert_offsets<<<1, 1, 0, stream>>>(
         static_cast<const int32_t*>(problem_sizes1.data_ptr()),
         static_cast<int32_t*>(expert_offsets.data_ptr()),
         static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts,
-        topk_ids.numel());
+        may_swap_ab);
   }
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
       static_cast<const int32_t*>(topk_ids.data_ptr()),
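The change above moves the swap-AB decision out of the offsets kernels and into a single `may_swap_ab` flag: each expert's row of `problem_sizes1` stores an (M, N, K) triple, and when A/B are swapped the per-expert token count lives in the N slot rather than the M slot. A host-side sketch of the same prefix sum (illustrative only; `expert_offsets_ref` is a hypothetical helper, not part of this diff):

```cpp
#include <cstdint>
#include <vector>

// problem_sizes1 holds num_experts consecutive (M, N, K) triples.
std::vector<int32_t> expert_offsets_ref(
    const std::vector<int32_t>& problem_sizes1, int num_experts,
    bool swap_ab) {
  std::vector<int32_t> offsets(num_experts + 1, 0);
  int32_t tot = 0;
  for (int i = 0; i < num_experts; ++i) {
    tot += swap_ab ? problem_sizes1[i * 3 + 1]  // N holds the token count
                   : problem_sizes1[i * 3];     // M holds the token count
    offsets[i + 1] = tot;
  }
  return offsets;
}
```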
@@ -335,7 +335,7 @@ void run_fp4_blockwise_scaled_group_mm(
   TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
 }
 
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
 constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
 constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
 #endif
@@ -356,7 +356,7 @@ void cutlass_fp4_group_mm(
     const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
     const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
     const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
   // Input validation
   CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
   CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
@@ -398,7 +398,7 @@ void cutlass_fp4_group_mm(
   TORCH_CHECK_NOT_IMPLEMENTED(
       false,
       "No compiled cutlass_fp4_group_mm kernel, vLLM must "
-      "be compiled with ENABLE_NVFP4 for SM100+ and CUDA "
+      "be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA "
       "12.8 or above.");
 #endif
 }
@@ -16,14 +16,15 @@
 
 #include <torch/all.h>
 
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
-void scaled_fp4_quant_sm100a(torch::Tensor const& output,
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                              torch::Tensor const& input,
                              torch::Tensor const& output_sf,
                              torch::Tensor const& input_sf);
 #endif
 
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
 void scaled_fp4_experts_quant_sm100a(
     torch::Tensor& output, torch::Tensor& output_scale,
     torch::Tensor const& input, torch::Tensor const& input_global_scale,
@@ -33,8 +34,9 @@ void scaled_fp4_experts_quant_sm100a(
 
 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                       torch::Tensor& output_sf, torch::Tensor const& input_sf) {
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
-  return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf);
+#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
+    (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf);
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
 }
@@ -44,7 +46,7 @@ void scaled_fp4_experts_quant(
     torch::Tensor const& input, torch::Tensor const& input_global_scale,
     torch::Tensor const& input_offset_by_experts,
     torch::Tensor const& output_scale_offset_by_experts) {
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
   return scaled_fp4_experts_quant_sm100a(
       output, output_scale, input, input_global_scale, input_offset_by_experts,
       output_scale_offset_by_experts);
@@ -332,7 +332,7 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
                                     int multiProcessorCount,
                                     cudaStream_t stream);
 
-void scaled_fp4_quant_sm100a(torch::Tensor const& output,
+void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
                              torch::Tensor const& input,
                              torch::Tensor const& output_sf,
                              torch::Tensor const& input_sf) {
@@ -16,7 +16,7 @@
 
 #include <torch/all.h>
 
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
 void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
                                   torch::Tensor const& B,
                                   torch::Tensor const& A_sf,
@@ -24,12 +24,22 @@ void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
                                   torch::Tensor const& alpha);
 #endif
 
+#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
+void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha);
+#endif
+
 void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
                            torch::Tensor const& B, torch::Tensor const& A_sf,
                            torch::Tensor const& B_sf,
                            torch::Tensor const& alpha) {
-#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
   return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+#elif defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
+  return cutlass_scaled_fp4_mm_sm120a(D, A, B, A_sf, B_sf, alpha);
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(false,
                               "No compiled nvfp4 mm kernel, vLLM should "
csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu — new file (285 lines)
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cutlass_extensions/common.hpp"
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/util/packed_stride.hpp"
+
+#include "core/math.hpp"
+
+using namespace cute;
+
+#define CHECK_TYPE(x, st, m) \
+  TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) \
+  TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) \
+  TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous")
+#define CHECK_INPUT(x, st, m) \
+  CHECK_TH_CUDA(x, m);        \
+  CHECK_CONTIGUOUS(x, m);     \
+  CHECK_TYPE(x, st, m)
+
+constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+
+struct sm120_fp4_config_M256 {
+  using ClusterShape = Shape<_1, _1, _1>;
+  using MmaTileShape = Shape<_128, _128, _128>;
+  using PerSmTileShape_MNK = Shape<_128, _128, _128>;
+};
+
+struct sm120_fp4_config_default {
+  using ClusterShape = Shape<_1, _1, _1>;
+  using MmaTileShape = Shape<_256, _128, _128>;
+  using PerSmTileShape_MNK = Shape<_256, _128, _128>;
+};
+
+template <typename Config, typename OutType>
+struct Fp4GemmSm120 {
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutATag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 32;
+
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutBTag = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 32;
+
+  using ElementD = OutType;
+  using ElementC = OutType;
+  using LayoutCTag = cutlass::layout::RowMajor;
+  using LayoutDTag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+
+  using ElementAccumulator = float;
+  using ArchTag = cutlass::arch::Sm120;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+
+  using MmaTileShape = typename Config::MmaTileShape;
+  using ClusterShape = typename Config::ClusterShape;
+  using PerSmTileShape_MNK = typename Config::PerSmTileShape_MNK;
+
+  using CollectiveEpilogue =
+      typename cutlass::epilogue::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, PerSmTileShape_MNK, ClusterShape,
+          cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
+          ElementAccumulator, ElementC, LayoutCTag, AlignmentC, ElementD,
+          LayoutDTag, AlignmentD,
+          cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;
+
+  using CollectiveMainloop =
+      typename cutlass::gemm::collective::CollectiveBuilder<
+          ArchTag, OperatorClass, ElementA, LayoutATag, AlignmentA, ElementB,
+          LayoutBTag, AlignmentB, ElementAccumulator, MmaTileShape,
+          ClusterShape,
+          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+              sizeof(typename CollectiveEpilogue::SharedStorage))>,
+          cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+};
+
+template <typename Gemm>
+typename Gemm::Arguments args_from_options(at::Tensor& D, at::Tensor const& A,
+                                           at::Tensor const& B,
+                                           at::Tensor const& A_sf,
+                                           at::Tensor const& B_sf,
+                                           torch::Tensor const& alpha, int M,
+                                           int N, int K) {
+  using ElementA = typename Gemm::ElementA;
+  using ElementB = typename Gemm::ElementB;
+  using ElementD = typename Gemm::ElementD;
+  using ElementSFA = cutlass::float_ue4m3_t;
+  using ElementSFB = cutlass::float_ue4m3_t;
+  using ElementCompute = float;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  using Sm1xxBlkScaledConfig =
+      typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+  auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1});
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1});
+  auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1});
+
+  auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(
+      cute::make_shape(M, N, K, 1));
+  auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(
+      cute::make_shape(M, N, K, 1));
+
+  typename Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {M, N, K, 1},
+      {static_cast<ElementA const*>(A.data_ptr()), stride_A,
+       static_cast<ElementB const*>(B.data_ptr()), stride_B,
+       static_cast<ElementSFA const*>(A_sf.data_ptr()), layout_SFA,
+       static_cast<ElementSFB const*>(B_sf.data_ptr()), layout_SFB},
+      {{},
+       static_cast<ElementD const*>(D.data_ptr()),
+       stride_D,
+       static_cast<ElementD*>(D.data_ptr()),
+       stride_D}};
+  auto& fusion_args = arguments.epilogue.thread;
+  fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr());
+
+  return arguments;
+}
+
+template <typename Gemm>
+void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
+             at::Tensor const& A_sf, at::Tensor const& B_sf,
+             torch::Tensor const& alpha, int M, int N, int K,
+             cudaStream_t stream) {
+  Gemm gemm;
+
+  auto arguments = args_from_options<Gemm>(D, A, B, A_sf, B_sf, alpha, M, N, K);
+
+  size_t workspace_size = Gemm::get_workspace_size(arguments);
+  auto const workspace_options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  CUTLASS_CHECK(gemm.can_implement(arguments));
+
+  CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream));
+}
+
+void cutlass_fp4_bf16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
+                                    torch::Tensor const& B,
+                                    torch::Tensor const& A_sf,
+                                    torch::Tensor const& B_sf,
+                                    torch::Tensor const& alpha, int m, int n,
+                                    int k, cudaStream_t stream) {
+  uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m));
+  if (mp2 <= 256) {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_M256, cutlass::bfloat16_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_default, cutlass::bfloat16_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  }
+}
+
+void cutlass_fp4_f16_gemm_dispatch(torch::Tensor& D, torch::Tensor const& A,
+                                   torch::Tensor const& B,
+                                   torch::Tensor const& A_sf,
+                                   torch::Tensor const& B_sf,
+                                   torch::Tensor const& alpha, int m, int n,
+                                   int k, cudaStream_t stream) {
+  uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m));
+  if (mp2 <= 256) {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_M256, cutlass::half_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    runGemm<Fp4GemmSm120<sm120_fp4_config_default, cutlass::half_t>::Gemm>(
+        D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  }
+}
+
+void cutlass_scaled_fp4_mm_sm120a(torch::Tensor& D, torch::Tensor const& A,
+                                  torch::Tensor const& B,
+                                  torch::Tensor const& A_sf,
+                                  torch::Tensor const& B_sf,
+                                  torch::Tensor const& alpha) {
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+  CHECK_INPUT(A, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(B, FLOAT4_E2M1X2, "b");
+
+  CHECK_INPUT(A_sf, SF_DTYPE, "scale_a");
+  CHECK_INPUT(B_sf, SF_DTYPE, "scale_b");
+
+  CHECK_INPUT(alpha, at::ScalarType::Float, "alpha");
+
+  TORCH_CHECK(A.dim() == 2, "a must be a matrix");
+  TORCH_CHECK(B.dim() == 2, "b must be a matrix");
+  TORCH_CHECK(A.sizes()[1] == B.sizes()[1],
+              "a and b shapes cannot be multiplied (", A.sizes()[0], "x",
+              A.sizes()[1], " and ", B.sizes()[0], "x", B.sizes()[1], ")");
+
+  auto const m = A.sizes()[0];
+  auto const n = B.sizes()[0];
+  auto const k = A.sizes()[1] * 2;
+
+  constexpr int alignment = 32;
+  TORCH_CHECK(k % alignment == 0, "Expected k to be divisible by ", alignment,
+              ", but got a shape: (", A.sizes()[0], "x", A.sizes()[1],
+              "), k: ", k, ".");
+  TORCH_CHECK(n % alignment == 0, "Expected n to be divisible by ", alignment,
+              ", but got b shape: (", B.sizes()[0], "x", B.sizes()[1], ").");
+
+  auto round_up = [](int x, int y) { return (x + y - 1) / y * y; };
+  int rounded_m = round_up(m, 128);
+  int rounded_n = round_up(n, 128);
+  // Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an
+  // integer.
+  int rounded_k = round_up(k / 16, 4);
+
+  TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix");
+  TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix");
+  TORCH_CHECK(A_sf.sizes()[1] == B_sf.sizes()[1],
+              "scale_a and scale_b shapes cannot be multiplied (",
+              A_sf.sizes()[0], "x", A_sf.sizes()[1], " and ", B_sf.sizes()[0],
+              "x", B_sf.sizes()[1], ")");
+  TORCH_CHECK(A_sf.sizes()[0] == rounded_m && A_sf.sizes()[1] == rounded_k,
+              "scale_a must be padded and swizzled to a shape (", rounded_m,
+              "x", rounded_k, "), but got a shape (", A_sf.sizes()[0], "x",
+              A_sf.sizes()[1], ")");
+  TORCH_CHECK(B_sf.sizes()[0] == rounded_n && B_sf.sizes()[1] == rounded_k,
+              "scale_b must be padded and swizzled to a shape (", rounded_n,
+              "x", rounded_k, "), but got a shape (", B_sf.sizes()[0], "x",
+              B_sf.sizes()[1], ")");
+
+  auto out_dtype = D.dtype();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
+
+  if (out_dtype == at::ScalarType::BFloat16) {
+    return cutlass_fp4_bf16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k,
+                                          stream);
+  } else if (out_dtype == at::ScalarType::Half) {
+    return cutlass_fp4_f16_gemm_dispatch(D, A, B, A_sf, B_sf, alpha, m, n, k,
+                                         stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm sm120 (",
+                out_dtype, ")");
+  }
+#else
+  TORCH_CHECK(false,
+              "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
+              "a CUTLASS 3.8 source directory to enable support.");
+#endif  // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+}
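The shape checks in `cutlass_scaled_fp4_mm_sm120a` encode how the packed fp4 operands and their e4m3 scale factors relate: two e2m1 values share one byte, so the logical K is twice the stored dimension, and the scale-factor tensors are expected padded to 128-row and 4-column multiples of the 1x16 scaling blocks. A small sketch of that bookkeeping (illustrative values only):

```cpp
#include <cstdio>

// Round x up to the nearest multiple of y, as the kernel's lambda does.
constexpr int round_up(int x, int y) { return (x + y - 1) / y * y; }

int main() {
  int m = 300, n = 512, packed_k = 1024;  // example problem sizes
  int k = packed_k * 2;                   // logical K of the fp4 GEMM

  int rounded_m = round_up(m, 128);       // expected scale_a rows
  int rounded_n = round_up(n, 128);       // expected scale_b rows
  int rounded_k = round_up(k / 16, 4);    // expected scale columns (1x16 blocks)

  std::printf("scale_a: %dx%d, scale_b: %dx%d\n", rounded_m, rounded_k,
              rounded_n, rounded_k);
  return 0;
}
```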
@@ -1,7 +1,8 @@
 #include "common.cuh"
 #include "dispatch_utils.h"
+#include "../vectorization_utils.cuh"
 #include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/Exceptions.h>
 
 #ifndef USE_ROCM
   #include <cub/cub.cuh>
@ -12,74 +13,127 @@
|
|||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
template <typename scalar_t, typename fp8_type>
|
template <typename scalar_t, typename fp8_type>
|
||||||
__global__ void scaled_fp8_quant_kernel(fp8_type* __restrict__ out,
|
__global__ void scaled_fp8_quant_kernel_strided(
|
||||||
const scalar_t* __restrict__ input,
|
fp8_type* __restrict__ out, const scalar_t* __restrict__ input,
|
||||||
const float* __restrict__ scale,
|
const float* __restrict__ scale, int hidden_size, int64_t in_row_stride,
|
||||||
int64_t num_elems) {
|
int64_t out_row_stride) {
|
||||||
int tid = blockDim.x * blockIdx.x + threadIdx.x;
|
const int64_t token_idx = blockIdx.x; // one token per block
|
||||||
|
const int tid = threadIdx.x;
|
||||||
|
|
||||||
// Invert the scale so that we can use multiplications to avoid expensive
|
const scalar_t* token_in = input + token_idx * in_row_stride;
|
||||||
// division.
|
fp8_type* token_out = out + token_idx * out_row_stride;
|
||||||
const float inverted_scale = 1.0f / (*scale);
|
|
||||||
scaled_fp8_conversion_vec<scalar_t, true>(
|
const float inv_scale = 1.0f / (*scale);
|
||||||
out, input, inverted_scale, num_elems, tid, blockDim.x * gridDim.x);
|
|
||||||
|
vectorize_with_alignment<16>(
|
||||||
|
token_in, token_out, hidden_size, tid, blockDim.x,
|
||||||
|
[=] __device__(fp8_type & dst, const scalar_t& src) {
|
||||||
|
dst = scaled_fp8_conversion<true, fp8_type>(static_cast<float>(src),
|
||||||
|
inv_scale);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t, typename fp8_type>
|
template <typename scalar_t, typename fp8_type>
|
||||||
__global__ void dynamic_per_token_scaled_fp8_quant_kernel(
|
__global__ void segmented_max_reduction_strided(
|
-    fp8_type* __restrict__ out, float* __restrict__ scale,
-    scalar_t const* __restrict__ input, float const* __restrict__ scale_ub,
-    const int hidden_size) {
-  int const tid = threadIdx.x;
-  int const token_idx = blockIdx.x;
-
-  // Use int64 to avoid overflowing an int32 when calculating this offset
-  int64_t offset = static_cast<int64_t>(token_idx) * hidden_size;
-  scalar_t const* __restrict__ token_input = &input[offset];
-  fp8_type* __restrict__ token_output = &out[offset];
-
-  // For vectorization, token_input and token_output pointers need to be
-  // aligned at 32-byte and 16-byte addresses respectively.
-  bool const can_vectorize = hidden_size % 16 == 0;
-
-  float absmax_val = 0.0f;
-  if (can_vectorize) {
-    absmax_val = thread_max_vec(token_input, hidden_size, tid, blockDim.x);
-  } else {
-    for (int i = tid; i < hidden_size; i += blockDim.x) {
-      float const x = static_cast<float>(token_input[i]);
-      absmax_val = fmaxf(absmax_val, fabsf(x));
-    }
-  }
+    float* __restrict__ scale, const scalar_t* __restrict__ input,
+    int hidden_size, int64_t in_row_stride, int64_t num_tokens) {
+  __shared__ float cache[256];
+  const int tid = threadIdx.x;
+  int64_t token_idx = blockIdx.x;
+
+  // one block per token. Guard in case gridDim.x > num_tokens.
+  if (token_idx >= num_tokens) {
+    return;
+  }
+
+  const scalar_t* row_ptr = input + token_idx * in_row_stride;
+  // each thread scans elements of the row in a strided fashion.
+  float thread_max = 0.0f;
+  for (int e = tid; e < hidden_size; e += blockDim.x) {
+    float v = fabsf(static_cast<float>(row_ptr[e]));
+    thread_max = fmaxf(thread_max, v);
+  }
+
+  cache[tid] = thread_max;
+  __syncthreads();
+
+  // parallel reduction to find row max.
+  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      cache[tid] = fmaxf(cache[tid], cache[tid + offset]);
+    }
+    __syncthreads();
+  }
+
+  // thread 0 updates global scale (per-tensor) atomically.
+  if (tid == 0) {
+    atomicMaxFloat(scale, cache[0] / quant_type_max_v<fp8_type>);
+  }
+}
+
+template <typename scalar_t, typename fp8_type>
+__global__ void scaled_fp8_quant_kernel_strided_dynamic(
+    fp8_type* __restrict__ out, const scalar_t* __restrict__ input,
+    const float* __restrict__ scale, int hidden_size, int64_t in_row_stride,
+    int64_t out_row_stride) {
+  const int64_t token_idx = blockIdx.x;
+  const int tid = threadIdx.x;
+
+  const scalar_t* token_in = input + token_idx * in_row_stride;
+  fp8_type* token_out = out + token_idx * out_row_stride;
+
+  const float reciprocal_scale = 1.0f / (*scale);
+  vectorize_with_alignment<16>(
+      token_in, token_out, hidden_size, tid, blockDim.x,
+      [=] __device__(fp8_type & dst, const scalar_t& src) {
+        dst = scaled_fp8_conversion<true, fp8_type>(static_cast<float>(src),
+                                                    reciprocal_scale);
+      });
+}
+
+template <typename scalar_t, typename fp8_type>
+__global__ void dynamic_per_token_scaled_fp8_quant_kernel_strided(
+    fp8_type* __restrict__ out, float* __restrict__ scale,
+    const scalar_t* __restrict__ input, const float* __restrict__ scale_ub,
+    int hidden_size, int64_t in_row_stride, int64_t out_row_stride) {
+  const int64_t token_idx = blockIdx.x;
+  const int tid = threadIdx.x;
+
+  // Use int64 to avoid overflowing an int32 when calculating this offset
+  int64_t in_offset = static_cast<int64_t>(token_idx) * in_row_stride;
+  int64_t out_offset = static_cast<int64_t>(token_idx) * out_row_stride;
+  const scalar_t* token_in = input + in_offset;
+  fp8_type* token_out = out + out_offset;
+
+  // 1) per-token absmax
+  float absmax_val = 0.f;
+  vectorize_read_with_alignment<16>(
+      token_in, hidden_size, tid, blockDim.x, [&] __device__(scalar_t v) {
+        absmax_val = fmaxf(absmax_val, fabsf(static_cast<float>(v)));
+      });
+
   using BlockReduce = cub::BlockReduce<float, 256>;
-  __shared__ typename BlockReduce::TempStorage reduceStorage;
-  float const block_absmax_val_maybe =
-      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
+  __shared__ typename BlockReduce::TempStorage tmp;
+  const float block_max =
+      BlockReduce(tmp).Reduce(absmax_val, cub::Max{}, blockDim.x);
 
   __shared__ float token_scale;
   if (tid == 0) {
-    if (scale_ub) {
-      token_scale = fminf(block_absmax_val_maybe, *scale_ub);
-    } else {
-      token_scale = block_absmax_val_maybe;
-    }
-    // token scale computation
+    token_scale = scale_ub ? fminf(block_max, *scale_ub) : block_max;
     token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
                         min_scaling_factor<fp8_type>::val());
     scale[token_idx] = token_scale;
   }
   __syncthreads();
 
-  // Note that we don't use inverted scales so we can match FBGemm impl.
-  if (can_vectorize) {
-    scaled_fp8_conversion_vec<scalar_t, false>(
-        token_output, token_input, token_scale, hidden_size, tid, blockDim.x);
-  } else {
-    for (int i = tid; i < hidden_size; i += blockDim.x) {
-      token_output[i] = scaled_fp8_conversion<false, fp8_type>(
-          static_cast<float>(token_input[i]), token_scale);
-    }
-  }
+  // 2) quantize
+  vectorize_with_alignment<16>(
+      token_in, token_out, hidden_size, tid, blockDim.x,
+      [=] __device__(fp8_type & dst, const scalar_t& src) {
+        dst = scaled_fp8_conversion<false, fp8_type>(static_cast<float>(src),
+                                                     token_scale);
+      });
 }
 
 }  // namespace vllm
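The strided kernels above delegate their inner loops to the vectorize_with_alignment<16> / vectorize_read_with_alignment<16> helpers, which are defined elsewhere in the tree and not shown in this diff. As a rough mental model only (a sketch under assumptions, not the repository's implementation, which additionally checks pointer alignment and issues true vector loads), such a helper splits a row into full vector-width chunks plus a scalar tail and applies a callable to every element:

// Sketch only: assumed shape of a vectorized element-wise helper.
// VEC_BYTES is the chunk size in bytes; `op(dst, src)` is applied per element.
template <int VEC_BYTES, typename InT, typename OutT, typename Op>
__device__ void vectorized_foreach_sketch(const InT* __restrict__ in,
                                          OutT* __restrict__ out, int len,
                                          int tid, int stride, Op op) {
  constexpr int VEC = VEC_BYTES / sizeof(InT);  // elements per chunk
  const int num_chunks = len / VEC;
  // Bulk: whole chunks, strided across the thread block so adjacent threads
  // touch adjacent chunks (coalesced accesses).
  for (int c = tid; c < num_chunks; c += stride) {
#pragma unroll
    for (int j = 0; j < VEC; ++j) {
      op(out[c * VEC + j], in[c * VEC + j]);
    }
  }
  // Tail: the remaining len % VEC elements, handled one by one.
  for (int i = num_chunks * VEC + tid; i < len; i += stride) {
    op(out[i], in[i]);
  }
}

The kernel bodies stay scalar-looking (one lambda per element) while the helper decides how the row is traversed.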
@@ -88,23 +142,31 @@ void static_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                              torch::Tensor const& input,  // [..., d]
                              torch::Tensor const& scale)  // [1]
 {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.is_contiguous());
-  int const block_size = 256;
-  int const num_tokens = input.numel() / input.size(-1);
-  int const num_elems = input.numel();
-  dim3 const grid(num_tokens);
-  dim3 const block(block_size);
+  TORCH_CHECK(input.stride(-1) == 1,
+              "last dimension of input must be contiguous");
+  TORCH_CHECK(out.stride(-1) == 1,
+              "last dimension of output must be contiguous");
+
+  const int hidden_size = input.size(-1);
+  const int num_tokens = input.numel() / hidden_size;
+  const int block_size = 256;
+  dim3 grid(num_tokens);
+  dim3 block(block_size);
+
+  const int64_t in_row_stride = input.stride(-2);
+  const int64_t out_row_stride = out.stride(-2);
+
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] {
         VLLM_DISPATCH_FP8_TYPES(
             out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] {
-              vllm::scaled_fp8_quant_kernel<scalar_t, fp8_t>
+              vllm::scaled_fp8_quant_kernel_strided<scalar_t, fp8_t>
                   <<<grid, block, 0, stream>>>(
                       out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
-                      scale.data_ptr<float>(), num_elems);
+                      scale.data_ptr<float>(), hidden_size, in_row_stride,
+                      out_row_stride);
             });
       });
 }
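With the contiguity requirement relaxed to stride(-1) == 1, inputs only need contiguous rows, not a fully contiguous tensor. A hypothetical host-side illustration (not taken from this PR's tests) of the kind of view the explicit in_row_stride / out_row_stride arguments make representable:

#include <torch/torch.h>

// A last-dim-contiguous but row-strided view: slicing columns out of a wider
// buffer keeps stride(-1) == 1 while stride(-2) remains the parent's width.
void strided_view_example() {
  const int64_t num_tokens = 4, hidden_size = 128;
  auto parent = torch::randn({num_tokens, 2 * hidden_size},
                             torch::dtype(torch::kFloat16).device(torch::kCUDA));
  auto input = parent.narrow(/*dim=*/1, /*start=*/0, /*length=*/hidden_size);
  // input.stride(-1) == 1 and input.stride(-2) == 2 * hidden_size, so the old
  // TORCH_CHECK(input.is_contiguous()) would have rejected it; the strided
  // kernels index rows via in_row_stride instead of assuming hidden_size.
}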
@@ -113,27 +175,42 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out,          // [..., d]
                               torch::Tensor const& input,  // [..., d]
                               torch::Tensor& scale)        // [1]
 {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.is_contiguous());
-  int const block_size = 256;
-  int const num_tokens = input.numel() / input.size(-1);
-  int const num_elems = input.numel();
-  dim3 const grid(num_tokens);
-  dim3 const block(block_size);
+  TORCH_CHECK(input.stride(-1) == 1,
+              "last dimension of input must be contiguous");
+  TORCH_CHECK(out.stride(-1) == 1,
+              "last dimension of output must be contiguous");
+
+  const int hidden_size = input.size(-1);
+  const int num_tokens = input.numel() / hidden_size;
+  const int block_size = 256;
+  dim3 grid(num_tokens);
+  dim3 block(block_size);
+
+  const int64_t in_row_stride = input.stride(-2);
+  const int64_t out_row_stride = out.stride(-2);
+
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // scale tensor should be initialised to <=0 before reduction
+  AT_CUDA_CHECK(
+      cudaMemsetAsync(scale.data_ptr<float>(), 0, sizeof(float), stream));
+
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "scaled_fp8_quant_kernel_scalar_type", [&] {
         VLLM_DISPATCH_FP8_TYPES(
             out.scalar_type(), "scaled_fp8_quant_kernel_fp8_type", [&] {
-              vllm::segmented_max_reduction<scalar_t, fp8_t>
-                  <<<grid, block, 0, stream>>>(scale.data_ptr<float>(),
-                                               input.data_ptr<scalar_t>(),
-                                               num_elems);
-              vllm::scaled_fp8_quant_kernel<scalar_t, fp8_t>
+              vllm::segmented_max_reduction_strided<scalar_t, fp8_t>
+                  <<<grid, block, 0, stream>>>(
+                      scale.data_ptr<float>(), input.data_ptr<scalar_t>(),
+                      hidden_size, in_row_stride,
+                      static_cast<int64_t>(num_tokens));
+
+              vllm::scaled_fp8_quant_kernel_strided_dynamic<scalar_t, fp8_t>
                   <<<grid, block, 0, stream>>>(
                       out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),
-                      scale.data_ptr<float>(), num_elems);
+                      scale.data_ptr<float>(), hidden_size, in_row_stride,
+                      out_row_stride);
             });
       });
 }
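The "initialised to <=0 before reduction" comment exists because segmented_max_reduction_strided folds every block's result into *scale with atomicMaxFloat, so a stale positive value would survive the max. The helper itself is not part of this excerpt; a minimal sketch of the usual trick, assuming only non-negative values ever reach it (true here, since block maxima of absolute values are >= 0):

// Assumed sketch, not the repository's atomicMaxFloat: for non-negative
// IEEE-754 floats the bit patterns order the same way as signed 32-bit
// integers, so an integer atomicMax on the reinterpreted value is a float max.
__device__ __forceinline__ float atomic_max_nonneg_float(float* addr,
                                                         float value) {
  int old = atomicMax(reinterpret_cast<int*>(addr), __float_as_int(value));
  return __int_as_float(old);
}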
@@ -142,14 +219,19 @@ void dynamic_per_token_scaled_fp8_quant(
     torch::Tensor& out,          // [..., d]
     torch::Tensor const& input,  // [..., d]
     torch::Tensor& scales, std::optional<at::Tensor> const& scale_ub) {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(input.stride(-1) == 1,
+              "last dimension of input must be contiguous");
+  TORCH_CHECK(out.stride(-1) == 1,
+              "last dimension of output must be contiguous");
 
-  int const hidden_size = input.size(-1);
-  int const num_tokens = input.numel() / hidden_size;
-  int const block_size = 256;
-  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, block_size));
+  const int hidden_size = input.size(-1);
+  const int num_tokens = input.numel() / hidden_size;
+  const int block_size = 256;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, block_size));
+
+  const int64_t in_row_stride = input.stride(-2);
+  const int64_t out_row_stride = out.stride(-2);
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -159,13 +241,12 @@ void dynamic_per_token_scaled_fp8_quant(
         VLLM_DISPATCH_FP8_TYPES(
             out.scalar_type(),
             "dynamic_per_token_scaled_fp8_quant_kernel_fp8_type", [&] {
-              vllm::dynamic_per_token_scaled_fp8_quant_kernel<scalar_t, fp8_t>
-                  <<<grid, block, 0, stream>>>(
+              vllm::dynamic_per_token_scaled_fp8_quant_kernel_strided<
+                  scalar_t, fp8_t><<<grid, block, 0, stream>>>(
                       out.data_ptr<fp8_t>(), scales.data_ptr<float>(),
                       input.data_ptr<scalar_t>(),
-                      scale_ub.has_value() ? scale_ub->data_ptr<float>()
-                                           : nullptr,
-                      hidden_size);
+                      scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
+                      hidden_size, in_row_stride, out_row_stride);
             });
       });
 }
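For the per-token path the stored scale is absmax / fp8_max (clamped by scale_ub and min_scaling_factor), and quantization divides by it, so consumers recover values as x ≈ float(q) * scales[token]. A hypothetical dequantization counterpart, shown only to make that scale convention concrete (it is not part of this diff):

// Hypothetical sketch: undo the per-token quantization above. Each row of `q`
// was produced as q = x / scales[row], so multiplying by scales[row] restores
// the original magnitude up to fp8 rounding error.
template <typename fp8_type>
__global__ void dequant_per_token_sketch(float* __restrict__ out,
                                         const fp8_type* __restrict__ q,
                                         const float* __restrict__ scales,
                                         int hidden_size, int64_t row_stride) {
  const int64_t row = blockIdx.x;
  const float s = scales[row];
  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
    out[row * hidden_size + i] =
        static_cast<float>(q[row * row_stride + i]) * s;
  }
}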
@@ -55,111 +55,4 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
 #endif
 }
-
-// Compute the absolute maximum m of the input tensor and store
-// m / float8_e4m3::max() in *scale. Each thread block performs a
-// reduction tree and the memory in scale is atomically updated.
-// So to get the right answer, *scale needs to be initialized to
-// a value <= 0.0 and we need to wait for all thread blocks to
-// finish before consuming *scale.
-template <typename scalar_t, typename fp8_type>
-__global__ void segmented_max_reduction(float* __restrict__ scale,
-                                        const scalar_t* __restrict__ input,
-                                        int64_t num_elems) {
-  __shared__ float cache[256];
-  int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  // First store maximum for all values processes by
-  // the current thread in cache[threadIdx.x]
-  scalar_t tmp = 0.0;
-  while (i < num_elems) {
-    float x = static_cast<float>(input[i]);
-    tmp = fmaxf(tmp, fabsf(x));
-    i += blockDim.x * gridDim.x;
-  }
-  cache[threadIdx.x] = tmp;
-
-  __syncthreads();
-
-  // Now perform parallel reduction within the thread block
-  int ib = blockDim.x / 2;
-  while (ib != 0) {
-    if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) {
-      cache[threadIdx.x] = cache[threadIdx.x + ib];
-    }
-    __syncthreads();
-    ib /= 2;
-  }
-  // Finally, since cache[0] contains the maximum for this thread block,
-  // atomically write the max to the target location
-  if (threadIdx.x == 0) {
-    atomicMaxFloat(scale, cache[0] / quant_type_max_v<fp8_type>);
-  }
-}
-
-template <typename scalar_t>
-__device__ float thread_max_vec(scalar_t const* __restrict__ input,
-                                int64_t const num_elems, int const tid,
-                                int const step) {
-  constexpr size_t VEC_SIZE = 16;
-  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
-  // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
-
-  // num_elems / VEC_SIZE (which is 16)
-  int64_t const num_vec_elems = num_elems >> 4;
-  float absmax_val = 0.0f;
-
-#pragma unroll
-  for (int64_t i = tid; i < num_vec_elems; i += step) {
-    scalarxN_t in_vec = vectorized_in[i];
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
-    }
-  }
-
-  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
-  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
-    absmax_val = fmaxf(absmax_val, fabsf(input[i]));
-  }
-
-  return absmax_val;
-}
-
-template <typename scalar_t, bool is_scale_inverted, typename fp8_type>
-__device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
-                                          scalar_t const* __restrict__ input,
-                                          float const scale,
-                                          int64_t const num_elems,
-                                          int const tid, int const step) {
-  constexpr size_t VEC_SIZE = 16;
-  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
-  using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
-  // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
-  auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
-
-  // num_elems / VEC_SIZE (which is 16)
-  int64_t const num_vec_elems = num_elems >> 4;
-
-#pragma unroll
-  for (int64_t i = tid; i < num_vec_elems; i += step) {
-    scalarxN_t in_vec = vectorized_in[i];
-    float8xN_t out_vec;
-
-#pragma unroll
-    for (int j = 0; j < VEC_SIZE; ++j) {
-      out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-          static_cast<float>(in_vec.val[j]), scale);
-    }
-    vectorized_out[i] = out_vec;
-  }
-
-  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
-  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
-    out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(input[i]), scale);
-  }
-}
 
 }  // namespace vllm
@@ -1,10 +1,10 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <c10/util/Float8_e4m3fn.h>
+#include "../per_token_group_quant_8bit.h"
 
 #include <cmath>
 
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+#include <cuda_fp8.h>
 
 #include <torch/all.h>
@@ -120,7 +120,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
                                 torch::Tensor& output_q,
                                 torch::Tensor& output_s, int64_t group_size,
                                 double eps, double min_8bit, double max_8bit,
-                                bool scale_ue8m0 = false) {
+                                bool scale_ue8m0) {
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(output_q.is_contiguous());
 
@@ -197,7 +197,9 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "per_token_group_quant_8bit", ([&] {
         if (dst_type == at::ScalarType::Float8_e4m3fn) {
-          LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
+          LAUNCH_KERNEL(scalar_t, __nv_fp8_e4m3);
+        } else if (dst_type == at::ScalarType::Char) {
+          LAUNCH_KERNEL(scalar_t, int8_t);
         }
       }));
 
@@ -4,7 +4,7 @@
 #include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 
-#include "cuda_compat.h"
+#include "../../cuda_compat.h"
 #include "dispatch_utils.h"
 
 #include "ggml-common.h"
@@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
   CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
       Shape_NKL shape_mkl) {
     auto layout = TVbNbKL_to_offset(shape_mkl);
-    return make_layout(coalesce(get<0>(layout)), get<1>(layout),
-                       get<2>(layout));
+    // for 4-bit elements, having >= 64 values per column
+    // allows TMA to load full 32-byte sectors
+    auto inner_layout =
+        make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
+
+    return make_layout(inner_layout, get<1>(layout), get<2>(layout));
   }
 
   // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)
csrc/quantization/per_token_group_quant_8bit.h (new file, +10 lines)
@@ -0,0 +1,10 @@
+#pragma once
+#include <torch/all.h>
+
+// TODO(wentao): refactor the folder to 8bit, then includes fp8 and int8 folders
+// 8-bit per-token-group quantization helper used by both FP8 and INT8
+void per_token_group_quant_8bit(const torch::Tensor& input,
+                                torch::Tensor& output_q,
+                                torch::Tensor& output_s, int64_t group_size,
+                                double eps, double min_8bit, double max_8bit,
+                                bool scale_ue8m0 = false);
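The header above declares the generic helper shared by the FP8 and INT8 per-token-group paths; each dtype-specific entry point just passes its own representable range. A hypothetical sketch of the INT8 wrapper, for orientation only (the real wrappers live in the .cu files touched by this PR and may differ):

#include <torch/all.h>
#include "per_token_group_quant_8bit.h"

// Hypothetical wrapper: forwards to the shared 8-bit kernel with the int8
// range; an FP8 wrapper would pass the e4m3 min/max instead.
void per_token_group_quant_int8_sketch(const torch::Tensor& input,
                                       torch::Tensor& output_q,
                                       torch::Tensor& output_s,
                                       int64_t group_size, double eps,
                                       double int8_min, double int8_max) {
  per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
                             int8_min, int8_max);
}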
@@ -19,7 +19,7 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <hip/hip_fp8.h>
 #include <hip/hip_bf16.h>
-#include "cuda_compat.h"
+#include "../cuda_compat.h"
 
 #include <algorithm>
 #include "../attention/dtype_fp8.cuh"
@@ -9,7 +9,7 @@
 #include <stdexcept>
 #include <algorithm>
 
-#include "cuda_compat.h"
+#include "../cuda_compat.h"
 #include "dispatch_utils.h"
 #include "quantization/fp8/common.cuh"
 
@@ -624,6 +624,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("per_token_group_fp8_quant", torch::kCUDA,
            &per_token_group_quant_fp8);
 
+  // Compute per-token-group INT8 quantized tensor and scaling factor.
+  ops.def(
+      "per_token_group_quant_int8(Tensor input, Tensor! output_q, Tensor! "
+      "output_s, int group_size, float eps, float int8_min, float int8_max) -> "
+      "()");
+  ops.impl("per_token_group_quant_int8", torch::kCUDA,
+           &per_token_group_quant_int8);
+
   // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
   ops.def(
       "rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
@@ -1,4 +1,3 @@
-
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.
 
@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12
 # Example:
 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+# TODO: Restore to base image after FlashInfer AOT wheel fixed
 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
 # By parameterizing the Deadsnakes repository URL, we allow third-party to use
@@ -119,6 +119,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
 
 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
@@ -164,9 +166,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-# Override the arch list for flash-attn to reduce the binary size
-ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
-ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 #################### BASE BUILD IMAGE ####################
 
 #################### WHEEL BUILD IMAGE ####################
@@ -184,6 +183,8 @@ COPY requirements/build.txt requirements/build.txt
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -r requirements/build.txt \
@@ -265,7 +266,7 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
 #################### EXTENSION Build IMAGE ####################
 
 #################### DEV IMAGE ####################
-FROM base as dev
+FROM base AS dev
 
 ARG PIP_INDEX_URL UV_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
@@ -275,10 +276,8 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Workaround for #17068
+# Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 
 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
@@ -290,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM ${FINAL_BASE_IMAGE} AS vllm-base
 ARG CUDA_VERSION
 ARG PYTHON_VERSION
@@ -349,6 +347,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
+# Use copy mode to avoid hardlink failures with Docker cache mounts
+ENV UV_LINK_MODE=copy
 
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -390,7 +390,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-ARG FLASHINFER_GIT_REF="v0.2.8rc1"
+# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
+# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
+ARG FLASHINFER_GIT_REF="v0.2.9"
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
     . /etc/environment
     git clone --depth 1 --recursive --shallow-submodules \
@@ -412,7 +414,7 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
     TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        python3 -m flashinfer.aot
     TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-        uv pip install --system --no-build-isolation .
+        uv pip install --system --no-build-isolation --force-reinstall --no-deps .
     popd
     rm -rf flashinfer
 BASH
@@ -433,6 +435,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -r requirements/build.txt \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
+# Install DeepGEMM from source
+ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
+ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
+RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
+    . /etc/environment
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"
+    CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
+    CUDA_MINOR="${CUDA_MINOR%%.*}"
+    if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then
+        git clone --recursive --shallow-submodules \
+            ${DEEPGEMM_GIT_REPO} deepgemm
+        echo "🏗️ Building DeepGEMM"
+        pushd deepgemm
+        git checkout ${DEEPGEMM_GIT_REF}
+        # Build DeepGEMM
+        # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
+        rm -rf build dist
+        rm -rf *.egg-info
+        python3 setup.py bdist_wheel
+        uv pip install --system dist/*.whl
+        popd
+        rm -rf deepgemm
+    else
+        echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
+    fi
+BASH
+
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
@@ -451,10 +480,8 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Workaround for #17068
+# Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -1,62 +0,0 @@
-# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform.
-
-FROM ubuntu:22.04 AS cpu-test-arm
-
-ENV CCACHE_DIR=/root/.cache/ccache
-
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-RUN --mount=type=cache,target=/var/cache/apt \
-    apt-get update -y \
-    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install py-cpuinfo  # Use this to gather CPU info and optimize based on ARM Neoverse cores
-
-# Set LD_PRELOAD for tcmalloc on ARM
-ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
-
-RUN echo 'ulimit -c 0' >> ~/.bashrc
-
-WORKDIR /workspace
-
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
-    pip install --upgrade pip && \
-    pip install -r requirements/build.txt
-
-FROM cpu-test-arm AS build
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-    pip install -v -r requirements/cpu.txt
-
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-# Disabling AVX512 specific optimizations for ARM
-ARG VLLM_CPU_DISABLE_AVX512="true"
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl && \
-    rm -rf dist
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -1,4 +1,11 @@
-# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+# This vLLM Dockerfile is used to build images that can run vLLM on both x86_64 and arm64 CPU platforms.
+#
+# Supported platforms:
+#   - linux/amd64 (x86_64)
+#   - linux/arm64 (aarch64)
+#
+# Use the `--platform` option with `docker buildx build` to specify the target architecture, e.g.:
+#   docker buildx build --platform=linux/arm64 -f docker/Dockerfile.cpu .
 #
 # Build targets:
 #   vllm-openai (default): used for serving deployment
@@ -12,16 +19,14 @@
 # VLLM_CPU_AVX512VNNI=false (default)|true
 #
 
-######################### BASE IMAGE #########################
-FROM ubuntu:22.04 AS base
+######################### COMMON BASE IMAGE #########################
+FROM ubuntu:22.04 AS base-common
 
 WORKDIR /workspace/
 
 ARG PYTHON_VERSION=3.12
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 
-ENV LD_PRELOAD=""
-
 # Install minimal dependencies and uv
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
@@ -53,7 +58,21 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --upgrade pip && \
     uv pip install -r requirements/cpu.txt
 
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
+ARG TARGETARCH
+ENV TARGETARCH=${TARGETARCH}
+
+######################### x86_64 BASE IMAGE #########################
+FROM base-common AS base-amd64
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so"
+
+######################### arm64 BASE IMAGE #########################
+FROM base-common AS base-arm64
+
+ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4"
+
+######################### BASE IMAGE #########################
+FROM base-${TARGETARCH} AS base
 
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 
@@ -94,7 +113,6 @@ WORKDIR /workspace/vllm
 
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
-    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
    sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
     sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
     sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
@@ -114,9 +114,6 @@ RUN cat torch_build_versions.txt
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
 
-# Override the arch list for flash-attn to reduce the binary size
-ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
-ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 #################### BASE BUILD IMAGE ####################
 
 #################### WHEEL BUILD IMAGE ####################
|
|||||||
ARG NIGHTLY_DATE="20250714"
|
ARG NIGHTLY_DATE="20250730"
|
||||||
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
|
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.12_tpuvm_$NIGHTLY_DATE"
|
||||||
|
|
||||||
FROM $BASE_IMAGE
|
FROM $BASE_IMAGE
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer pytest modelscope
+    pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
 
 ENV VLLM_USAGE_SOURCE production-docker-image \
     TRITON_XPU_PROFILE 1
@@ -56,9 +56,7 @@ nav:
     - contributing/model/tests.md
     - contributing/model/multimodal.md
   - CI: contributing/ci
-  - Design Documents:
-    - V0: design
-    - V1: design/v1
+  - Design Documents: design
   - API Reference:
     - Summary: api/README.md
     - Contents:
Some files were not shown because too many files have changed in this diff.