Compare commits

..

50 Commits

Author SHA1 Message Date
220d694080 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 01:00:20 +00:00
70e06dd574 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 00:46:46 +00:00
7954461d4c updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 23:03:42 +00:00
a10da86677 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:56:53 +00:00
284d5df45b added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:50:20 +00:00
d5b0db449e added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:44:36 +00:00
66349c33a1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:36:57 +00:00
28d0396ff1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:54:04 +00:00
2f29ae383a added files
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:45:01 +00:00
cf64b0e6a7 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:44:14 +00:00
f51f182d64 pre-commit
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 20:18:50 +00:00
79e465f557 fix pre-commit
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-23 09:38:55 -04:00
2ba687d39f updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:52:06 -04:00
5d57896e2c updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:53 -04:00
f6f008ca1d cleanup
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:21 -04:00
24cbbe4778 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:50:48 -04:00
2fec6e0b5c working?
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:45:00 -04:00
47a3f26b2a updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:36:52 -04:00
144162fc8c Merge branch 'main' into rob-fixes 2025-03-22 17:12:53 -04:00
522279ebb9 Stash
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:12:21 -04:00
85687b43e7 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:00:46 -04:00
120bbdfd82 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 15:58:51 -04:00
2ceb7bc534 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:25:05 -04:00
9f7fb5ec84 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:22:00 -04:00
a8a621e419 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:11:50 -04:00
b89d89f456 fix rebase
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:32:21 +08:00
8355358fb3 add unlimited HWM
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
c0b1443345 fix mypy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
d35dace985 refactor zmq msg to object
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
912031ceb5 refactor disagg
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
4f13e89143 fix SIM105
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
b9a7dbe769 remove default socket address value
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
0cb2e05256 change log level and fix some comments
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
d6945ecdf0 change disagg_prefill example to use zmq
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
298298f97d remove invalid zmq benchmark code
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
6c8fae82dd run format
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
16ed827378 add benchmark shell
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:08 +08:00
8fa9df7987 run format.sh
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
27c1afe88b fix ThreadProxy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
ee6607332e create proxy sockets in the proxy function for thread safety
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
7fbf70db57 1. replace tpc:// with ipc:// \n 2. fix json response
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
2c31e4c3ea Run yapf and ruff
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
187f112ccd 1. fix mypy issue
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
897db7b93d Replace zmq.asyncio.Context().term() with zmq.asyncio.Context().destroy(linger=0) for immediate termination
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
b7ffb43792 update disagg_connect test_request.py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
6e1fba8a73 1. connect_parser set --prefill-addr and --decode-addr are required
2.To more accurately reflect its purpose, we will rename connect.py to disagg_connector.py.

Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
bfde1688e7 add /v1/completions stream support
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
905424ed65 add identity url headers
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
5d20f389d6 add vllm connect cmd
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
2a0cb78016 add test py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
2107 changed files with 58678 additions and 180648 deletions

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import sys import sys
@ -9,12 +8,12 @@ import zipfile
# Note that we have 400 MiB quota, please use it wisely. # Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 . # See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile. # Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400)) VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
def print_top_10_largest_files(zip_file): def print_top_10_largest_files(zip_file):
"""Print the top 10 largest files in the given zip file.""" """Print the top 10 largest files in the given zip file."""
with zipfile.ZipFile(zip_file, "r") as z: with zipfile.ZipFile(zip_file, 'r') as z:
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
file_sizes.sort(key=lambda x: x[1], reverse=True) file_sizes.sort(key=lambda x: x[1], reverse=True)
for f, size in file_sizes[:10]: for f, size in file_sizes[:10]:
@ -29,18 +28,14 @@ def check_wheel_size(directory):
wheel_path = os.path.join(root, file_name) wheel_path = os.path.join(root, file_name)
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
if wheel_size_mb > VLLM_MAX_SIZE_MB: if wheel_size_mb > VLLM_MAX_SIZE_MB:
print( print(f"Not allowed: Wheel {wheel_path} is larger "
f"Not allowed: Wheel {wheel_path} is larger " f"({wheel_size_mb:.2f} MB) than the limit "
f"({wheel_size_mb:.2f} MB) than the limit " f"({VLLM_MAX_SIZE_MB} MB).")
f"({VLLM_MAX_SIZE_MB} MB)."
)
print_top_10_largest_files(wheel_path) print_top_10_largest_files(wheel_path)
return 1 return 1
else: else:
print( print(f"Wheel {wheel_path} is within the allowed size "
f"Wheel {wheel_path} is within the allowed size " f"({wheel_size_mb:.2f} MB).")
f"({wheel_size_mb:.2f} MB)."
)
return 0 return 0
@ -50,4 +45,4 @@ if __name__ == "__main__":
sys.exit(1) sys.exit(1)
directory = sys.argv[1] directory = sys.argv[1]
sys.exit(check_wheel_size(directory)) sys.exit(check_wheel_size(directory))

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import os import os
@ -23,5 +22,5 @@ with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}") print(f"Generated index.html for {args.wheel}")
# cloudfront requires escaping the '+' character # cloudfront requires escaping the '+' character
f.write( f.write(
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) template.format(wheel=filename,
) wheel_html_escaped=filename.replace("+", "%2B")))

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct" model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks: tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-8B-Instruct" model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks: tasks:
- name: "gsm8k" - name: "gsm8k"

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128" model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks: tasks:

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.335
- name: "exact_match,flexible-extract"
value: 0.323
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8" model_name: "mgoin/Minitron-4B-Base-FP8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks: tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size). # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks: tasks:
- name: "gsm8k" - name: "gsm8k"

View File

@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.30
- name: "exact_match,flexible-extract"
value: 0.465
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks: tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct" model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks: tasks:

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.54
- name: "exact_match,flexible-extract"
value: 0.59
limit: 1319
num_fewshot: 5

View File

@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.47
- name: "exact_match,flexible-extract"
value: 0.64
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks: tasks:

View File

@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -1,6 +1,10 @@
Qwen2.5-1.5B-Instruct.yaml Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest
def pytest_addoption(parser):
parser.addoption(
"--config-list-file",
action="store",
help="Path to the file listing model config YAMLs (one per line)",
)
parser.addoption(
"--tp-size",
action="store",
default="1",
help="Tensor parallel size to use for evaluation",
)
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
rel_path = pytestconfig.getoption("--config-list-file")
return config_dir / rel_path
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
return pytestconfig.getoption("--tp-size")
def pytest_generate_tests(metafunc):
if "config_filename" in metafunc.fixturenames:
rel_path = metafunc.config.getoption("--config-list-file")
config_list_file = Path(rel_path).resolve()
config_dir = config_list_file.parent
with open(config_list_file, encoding="utf-8") as f:
configs = [
config_dir / line.strip()
for line in f
if line.strip() and not line.startswith("#")
]
metafunc.parametrize("config_filename", configs)

View File

@ -0,0 +1,59 @@
#!/bin/bash
usage() {
echo``
echo "Runs lm eval harness on GSM8k using vllm and compares to "
echo "precomputed baseline (measured by HF transformers.)"
echo
echo "usage: ${0} <options>"
echo
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
echo " -t - tensor parallel size"
echo
}
SUCCESS=0
while getopts "c:t:" OPT; do
case ${OPT} in
c )
CONFIG="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
LOCAL_SUCCESS=0
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
export LM_EVAL_TP_SIZE=$TP_SIZE
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
else
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
fi
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done
if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi

View File

@ -1,55 +1,69 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
LM eval harness on model to compare vs HF baseline computed offline. LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml Configs are found in configs/$MODEL.yaml
pytest -s -v test_lm_eval_correctness.py \ * export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
--config-list-file=configs/models-small.txt \ * export LM_EVAL_TP_SIZE=4
--tp-size=1 * pytest -s test_lm_eval_correctness.py
""" """
import os
from pathlib import Path
import lm_eval import lm_eval
import numpy as np import numpy
import pytest
import yaml import yaml
RTOL = 0.08 RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config, tp_size): def launch_lm_eval(eval_config):
trust_remote_code = eval_config.get("trust_remote_code", False) trust_remote_code = eval_config.get('trust_remote_code', False)
model_args = (
f"pretrained={eval_config['model_name']}," model_args = f"pretrained={eval_config['model_name']}," \
f"tensor_parallel_size={tp_size}," f"tensor_parallel_size={TP_SIZE}," \
f"enforce_eager=true," f"add_bos_token=true," \
f"add_bos_token=true," f"trust_remote_code={trust_remote_code}"
f"trust_remote_code={trust_remote_code}"
)
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model="vllm",
model_args=model_args, model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]], tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"], num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"], limit=eval_config["limit"],
batch_size="auto", batch_size="auto")
)
return results return results
def test_lm_eval_correctness_param(config_filename, tp_size): def test_lm_eval_correctness():
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
results = launch_lm_eval(eval_config, tp_size) if eval_config[
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
pytest.skip("FBGEMM is currently failing on main.")
# Launch eval requests.
results = launch_lm_eval(eval_config)
# Confirm scores match ground truth.
success = True success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
ground_truth = metric["value"] ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print( print(f'{task["name"]} | {metric["name"]}: '
f"{task['name']} | {metric['name']}: " f'ground_truth={ground_truth} | measured={measured_value}')
f"ground_truth={ground_truth} | measured={measured_value}" success = success and numpy.isclose(
) ground_truth, measured_value, rtol=RTOL)
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
# Assert at the end, print all scores even on failure for debugging.
assert success assert success

View File

@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Visualizing the results ### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running. If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
import os import os
@ -66,18 +65,18 @@ def read_markdown(file):
def results_to_json(latency, throughput, serving): def results_to_json(latency, throughput, serving):
return json.dumps( return json.dumps({
{ 'latency': latency.to_dict(),
"latency": latency.to_dict(), 'throughput': throughput.to_dict(),
"throughput": throughput.to_dict(), 'serving': serving.to_dict()
"serving": serving.to_dict(), })
}
)
if __name__ == "__main__": if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file) as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
@ -121,8 +120,7 @@ if __name__ == "__main__":
for perc in [10, 25, 50, 75, 90, 99]: for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms # Multiply 1000 to convert the time unit from s to ms
raw_result.update( raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
)
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result # add the result to raw_result
@ -155,27 +153,26 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results) serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results) throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json( raw_results_json = results_to_json(latency_results, throughput_results,
latency_results, throughput_results, serving_results serving_results)
)
# remapping the key, for visualization purpose # remapping the key, for visualization purpose
if not latency_results.empty: if not latency_results.empty:
latency_results = latency_results[list(latency_column_mapping.keys())].rename( latency_results = latency_results[list(
columns=latency_column_mapping latency_column_mapping.keys())].rename(
) columns=latency_column_mapping)
if not serving_results.empty: if not serving_results.empty:
serving_results = serving_results[list(serving_column_mapping.keys())].rename( serving_results = serving_results[list(
columns=serving_column_mapping serving_column_mapping.keys())].rename(
) columns=serving_column_mapping)
if not throughput_results.empty: if not throughput_results.empty:
throughput_results = throughput_results[ throughput_results = throughput_results[list(
list(throughput_results_column_mapping.keys()) throughput_results_column_mapping.keys())].rename(
].rename(columns=throughput_results_column_mapping) columns=throughput_results_column_mapping)
processed_results_json = results_to_json( processed_results_json = results_to_json(latency_results,
latency_results, throughput_results, serving_results throughput_results,
) serving_results)
for df in [latency_results, serving_results, throughput_results]: for df in [latency_results, serving_results, throughput_results]:
if df.empty: if df.empty:
@ -187,39 +184,38 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE" # we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply( df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}" lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
)
# get markdown tables # get markdown tables
latency_md_table = tabulate( latency_md_table = tabulate(latency_results,
latency_results, headers="keys", tablefmt="pipe", showindex=False headers='keys',
) tablefmt='pipe',
serving_md_table = tabulate( showindex=False)
serving_results, headers="keys", tablefmt="pipe", showindex=False serving_md_table = tabulate(serving_results,
) headers='keys',
throughput_md_table = tabulate( tablefmt='pipe',
throughput_results, headers="keys", tablefmt="pipe", showindex=False showindex=False)
) throughput_md_table = tabulate(throughput_results,
headers='keys',
tablefmt='pipe',
showindex=False)
# document the result # document the result
with open(results_folder / "benchmark_results.md", "w") as f: with open(results_folder / "benchmark_results.md", "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/" results = read_markdown("../.buildkite/nightly-benchmarks/" +
+ "performance-benchmarks-descriptions.md" "performance-benchmarks-descriptions.md")
)
results = results.format( results = results.format(
latency_tests_markdown_table=latency_md_table, latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table, throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table, serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json, benchmarking_results_in_json_string=processed_results_json)
)
f.write(results) f.write(results)
# document benchmarking results in json # document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f: with open(results_folder / "benchmark_results.json", "w") as f:
results = (
latency_results.to_dict(orient="records") results = latency_results.to_dict(
+ throughput_results.to_dict(orient="records") orient='records') + throughput_results.to_dict(
+ serving_results.to_dict(orient="records") orient='records') + serving_results.to_dict(orient='records')
)
f.write(json.dumps(results)) f.write(json.dumps(results))

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
@ -15,12 +14,15 @@ def main(model, cachedir):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer" description="Download and save Hugging Face tokenizer")
) parser.add_argument("--model",
parser.add_argument("--model", type=str, required=True, help="Name of the model") type=str,
parser.add_argument( required=True,
"--cachedir", type=str, required=True, help="Directory to save the tokenizer" help="Name of the model")
) parser.add_argument("--cachedir",
type=str,
required=True,
help="Directory to save the tokenizer")
args = parser.parse_args() args = parser.parse_args()
main(args.model, args.cachedir) main(args.model, args.cachedir)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
@ -12,33 +11,33 @@ from tabulate import tabulate
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Parse command line arguments for summary-nightly-results script." description=
) 'Parse command line arguments for summary-nightly-results script.')
parser.add_argument( parser.add_argument('--results-folder',
"--results-folder", type=str,
type=str, required=True,
required=True, help='The folder where the results are stored.')
help="The folder where the results are stored.", parser.add_argument('--description',
) type=str,
parser.add_argument( required=True,
"--description", type=str, required=True, help="Description of the results." help='Description of the results.')
)
args = parser.parse_args() args = parser.parse_args()
return args return args
def get_perf(df, method, model, metric): def get_perf(df, method, model, metric):
means = [] means = []
for qps in [2, 4, 8, 16, "inf"]: for qps in [2, 4, 8, 16, "inf"]:
target = df["Test name"].str.contains(model) target = df['Test name'].str.contains(model)
target = target & df["Engine"].str.contains(method) target = target & df['Engine'].str.contains(method)
target = target & df["Test name"].str.contains("qps_" + str(qps)) target = target & df['Test name'].str.contains("qps_" + str(qps))
filtered_df = df[target] filtered_df = df[target]
if filtered_df.empty: if filtered_df.empty:
means.append(0.0) means.append(0.)
else: else:
means.append(filtered_df[metric].values[0]) means.append(filtered_df[metric].values[0])
@ -46,6 +45,7 @@ def get_perf(df, method, model, metric):
def get_perf_w_std(df, method, model, metric): def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]: if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)") mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist() mean = mean.tolist()
@ -60,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
else: else:
assert metric == "Tput" assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)" df, method, model, "Output Tput (tok/s)")
)
mean = mean.tolist() mean = mean.tolist()
std = None std = None
@ -81,17 +80,18 @@ def main(args):
# generate markdown table # generate markdown table
df = pd.DataFrame.from_dict(results) df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
with open(args.description) as f: with open(args.description) as f:
description = f.read() description = f.read()
description = description.format(nightly_results_benchmarking_table=md_table) description = description.format(
nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f: with open("nightly_results.md", "w") as f:
f.write(description) f.write(description)
if __name__ == "__main__": if __name__ == '__main__':
args = parse_arguments() args = parse_arguments()
main(args) main(args)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient from lmdeploy.serve.openai.api_client import APIClient

View File

@ -10,24 +10,15 @@ set -x
set -o pipefail set -o pipefail
check_gpus() { check_gpus() {
if command -v nvidia-smi; then # check the number of GPUs and GPU type.
# check the number of GPUs and GPU type. declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then if [[ $gpu_count -gt 0 ]]; then
echo "GPU found." echo "GPU found."
else else
echo "Need at least 1 GPU to run benchmarking." echo "Need at least 1 GPU to run benchmarking."
exit 1 exit 1
fi fi
if command -v nvidia-smi; then declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
fi
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
@ -99,15 +90,9 @@ kill_gpu_processes() {
# wait until GPU memory usage smaller than 1GB # wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1
sleep 1 done
done
elif command -v amd-smi; then
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
sleep 1
done
fi
# remove vllm config file # remove vllm config file
rm -rf ~/.config/vllm rm -rf ~/.config/vllm

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime import datetime
import json import json
@ -35,8 +34,10 @@ serving_column_mapping = {
} }
if __name__ == "__main__": if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file) as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
@ -55,16 +56,17 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results) serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty: if not serving_results.empty:
serving_results = serving_results[list(serving_column_mapping.keys())].rename( serving_results = serving_results[list(
columns=serving_column_mapping serving_column_mapping.keys())].rename(
) columns=serving_column_mapping)
serving_md_table_with_headers = tabulate( serving_md_table_with_headers = tabulate(serving_results,
serving_results, headers="keys", tablefmt="pipe", showindex=False headers='keys',
) tablefmt='pipe',
showindex=False)
# remove the first line of header # remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split("\n") serving_md_table_lines = serving_md_table_with_headers.split('\n')
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@ -74,9 +76,10 @@ if __name__ == "__main__":
# document results with header. # document results with header.
# for those who wants to reproduce our benchmark. # for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers) f.write(serving_md_table_with_headers)
f.write("\n") f.write('\n')
# document benchmarking results in json # document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient="records")
results = serving_results.to_dict(orient='records')
f.write(json.dumps(results)) f.write(json.dumps(results))

View File

@ -63,12 +63,10 @@
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "", "disable_log_requests": "",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16, "swap_space": 16,
"speculative_config": { "speculative_model": "turboderp/Qwama-0.5B-Instruct",
"model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4,
"num_speculative_tokens": 4, "speculative_draft_tensor_parallel_size": 1
"draft_tensor_parallel_size": 1
}
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -1,46 +0,0 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true

View File

@ -1,25 +1,23 @@
steps: steps:
- label: "Build wheel - CUDA 12.8" - label: "Build wheel - CUDA 12.4"
id: build-wheel-cuda-12-8
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6" - label: "Build wheel - CUDA 12.1"
id: build-wheel-cuda-12-6
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -30,14 +28,13 @@ steps:
- label: "Build wheel - CUDA 11.8" - label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel # depends_on: block-build-cu118-wheel
id: build-wheel-cuda-11-8
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/upload-wheels.sh"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -47,49 +44,33 @@ steps:
- label: "Build release image" - label: "Build release image"
depends_on: block-release-image-build depends_on: block-release-image-build
id: build-release-image
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- build-release-image
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image" - label: "Build and publish TPU release image"
depends_on: ~ depends_on: ~
if: build.env("NIGHTLY") == "1" if: build.env("NIGHTLY") == "1"
agents: agents:
queue: tpu_queue_postmerge queue: tpu_queue_postmerge
commands: commands:
- "yes | docker system prune -a" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "git fetch --all"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly" - "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins: plugins:
- docker-login#v3.0.0: - docker-login#v3.0.0:
username: vllmbot username: vllm
password-env: DOCKERHUB_TOKEN password-env: DOCKERHUB_TOKEN
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- input: "Provide Release version here" - input: "Provide Release version here"
id: input-release-version
fields: fields:
- text: "What is the release version?" - text: "What is the release version?"
key: release-version key: "release-version"
- block: "Build CPU release image" - block: "Build CPU release image"
key: block-cpu-release-image-build key: block-cpu-release-image-build
@ -101,22 +82,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- block: "Build Neuron release image"
key: block-neuron-release-image-build
depends_on: ~
- label: "Build and publish Neuron release image"
depends_on: block-neuron-release-image-build
agents:
queue: neuron-postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -3,9 +3,6 @@
# This script runs test inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
set -o pipefail set -o pipefail
# Export Python path
export PYTHONPATH=".."
# Print ROCm version # Print ROCm version
echo "--- Confirming Clean Initial State" echo "--- Confirming Clean Initial State"
while true; do while true; do
@ -77,106 +74,50 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@ commands=$@
echo "Commands:$commands" echo "Commands:$commands"
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests #ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then if [[ $commands == *" kernels "* ]]; then
commands="${commands} \ commands="${commands} \
--ignore=kernels/core/test_fused_quant_layernorm.py \ --ignore=kernels/test_attention_selector.py \
--ignore=kernels/core/test_permute_cols.py" --ignore=kernels/test_blocksparse_attention.py \
fi --ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
if [[ $commands == *" kernels/attention"* ]]; then --ignore=kernels/test_encoder_decoder_attn.py \
commands="${commands} \ --ignore=kernels/test_flash_attn.py \
--ignore=kernels/attention/stest_attention_selector.py \ --ignore=kernels/test_flashinfer.py \
--ignore=kernels/attention/test_blocksparse_attention.py \ --ignore=kernels/test_int8_quant.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \ --ignore=kernels/test_machete_gemm.py \
--ignore=kernels/attention/test_attention_selector.py \ --ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/attention/test_flash_attn.py \ --ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/attention/test_flashinfer.py \ --ignore=kernels/test_moe.py \
--ignore=kernels/attention/test_prefix_prefill.py \ --ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/attention/test_cascade_flash_attn.py \ --ignore=kernels/test_rand.py \
--ignore=kernels/attention/test_mha_attn.py \ --ignore=kernels/test_sampler.py \
--ignore=kernels/attention/test_lightning_attn.py \ --ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/attention/test_attention.py" --ignore=kernels/test_mamba_mixer2.py \
fi --ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
if [[ $commands == *" kernels/quantization"* ]]; then --ignore=kernels/test_mha_attn.py \
commands="${commands} \ --ignore=kernels/test_block_fp8.py \
--ignore=kernels/quantization/test_int8_quant.py \ --ignore=kernels/test_permute_cols.py"
--ignore=kernels/quantization/test_aqlm.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
--ignore=kernels/quantization/test_marlin_gemm.py \
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
--ignore=kernels/quantization/test_int8_kernel.py"
fi
if [[ $commands == *" kernels/mamba"* ]]; then
commands="${commands} \
--ignore=kernels/mamba/test_mamba_mixer2.py \
--ignore=kernels/mamba/test_causal_conv1d.py \
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi
if [[ $commands == *" kernels/moe"* ]]; then
commands="${commands} \
--ignore=kernels/moe/test_moe.py \
--ignore=kernels/moe/test_cutlass_moe.py \
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi fi
#ignore certain Entrypoints/openai tests #ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \ commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \ --ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "} --ignore=entrypoints/openai/test_prompt_validation.py "}
fi fi
#ignore certain Entrypoints/llm tests #ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \ commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py # --ignore=entrypoints/openai/test_oot_registration.py
@ -185,8 +126,6 @@ fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used # assign job count as the number of shards used
@ -195,10 +134,9 @@ if [[ $commands == *"--shard-id="* ]]; then
# assign shard-id for each shard # assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu" echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --device /dev/kfd --device /dev/dri \
--network=host \ --network host \
--shm-size=16gb \ --shm-size=16gb \
--rm \ --rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \ -e HIP_VISIBLE_DEVICES="${GPU}" \
@ -207,7 +145,6 @@ if [[ $commands == *"--shard-id="* ]]; then
-e AWS_SECRET_ACCESS_KEY \ -e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \ --name "${container_name}_${GPU}" \
"${image_name}" \ "${image_name}" \
/bin/bash -c "${commands_gpu}" \ /bin/bash -c "${commands_gpu}" \
@ -226,10 +163,9 @@ if [[ $commands == *"--shard-id="* ]]; then
fi fi
done done
else else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \ docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --device /dev/kfd --device /dev/dri \
--network=host \ --network host \
--shm-size=16gb \ --shm-size=16gb \
--rm \ --rm \
-e HIP_VISIBLE_DEVICES=0 \ -e HIP_VISIBLE_DEVICES=0 \
@ -238,7 +174,6 @@ else
-e AWS_SECRET_ACCESS_KEY \ -e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \
/bin/bash -c "${commands}" /bin/bash -c "${commands}"

View File

@ -5,8 +5,8 @@
set -ex set -ex
set -o pipefail set -o pipefail
# cd 2 levels into the working directory # cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/../.." cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

View File

@ -10,4 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.s390x . docker build -t cpu-test -f Dockerfile.ppc64le .

View File

@ -0,0 +1,90 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

View File

@ -9,13 +9,11 @@ python3 use_existing_torch.py
# Try building the docker image # Try building the docker image
DOCKER_BUILDKIT=1 docker build . \ DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \ --target vllm-openai \
--platform "linux/arm64" \ --platform "linux/arm64" \
-t gh200-test \ -t gh200-test \
--build-arg max_jobs=66 \ --build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \ --build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real" --build-arg vllm_fa_cmake_gpu_arches="90-real"
@ -25,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
' '

View File

@ -5,22 +5,20 @@
set -ex set -ex
# Try building the docker image # Try building the docker image
docker build -t hpu-test-env -f docker/Dockerfile.hpu . docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup # Setup cleanup
# certain versions of HPU software stack have a bug that can # certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use # override the exit code of the script, so we need to use
# separate remove_docker_containers and remove_docker_containers_and_exit # separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container # functions, while other platforms only need one remove_docker_container
# function. # function.
EXITCODE=1 EXITCODE=1
remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; } remove_docker_container() { docker rm -f hpu-test || true; }
remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; } remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_containers_and_exit EXIT trap remove_docker_container_and_exit EXIT
remove_docker_containers remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
EXITCODE=$? EXITCODE=$?

View File

@ -3,7 +3,7 @@
set -euox pipefail set -euox pipefail
if [[ $# -lt 4 ]]; then if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1 exit 1
fi fi

View File

@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}" mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}" mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image # Try building the docker image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day # prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp. # by using a timestamp file in tmp.
@ -36,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp date "+%s" > /tmp/neuron-docker-build-timestamp
fi fi
docker build -t "${image_name}" -f docker/Dockerfile.neuron . docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
@ -48,16 +47,8 @@ trap remove_docker_container EXIT
docker run --rm -it --device=/dev/neuron0 --network bridge \ docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "HF_TOKEN=${HF_TOKEN}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \ --name "${container_name}" \
${image_name} \ ${image_name} \
/bin/bash -c " /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo 'Running test file: '$f;
python3 -m pytest \$f -v --capture=tee-sys;
done
"

38
.buildkite/run-tpu-v1-test.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo TEST_1 \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& echo TEST_4 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image # Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu . docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {

View File

@ -1,31 +0,0 @@
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@ -1,17 +0,0 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 ci.log"
exit 1
fi
INPUT_FILE="$1"
# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

View File

@ -1,49 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Setup cleanup
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman stop --all -t0
podman rm -f "$container_id" || true
fi
podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
podman exec -it "$container_id" bash -c "
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests

View File

@ -1,101 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
VLLM_USE_V1=0 pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@ -1,185 +0,0 @@
#!/bin/bash
set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN.
source /etc/environment
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -1,18 +0,0 @@
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 path/to/test.py::test_name"
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
exit 1
fi
TEST=$1
COUNT=1
while pytest -sv "$TEST"; do
COUNT=$((COUNT + 1))
echo "RUN NUMBER ${COUNT}"
done

View File

@ -1,24 +0,0 @@
#!/bin/bash
set -euo pipefail
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi

View File

@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -1,102 +0,0 @@
#!/bin/bash
if [ ! -f "$1" ]; then
echo "Error: The env file '$1' does not exist."
exit 1 # Exit the script with a non-zero status to indicate an error
fi
ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg USE_SCCACHE=1 \
--build-arg GIT_REPO_CHECK=0 \
--tag vllm/vllm-tpu-bm \
--progress plain -f docker/Dockerfile.tpu .
LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"
if [ -z "$HF_TOKEN" ]; then
echo "Error: HF_TOKEN is not set or is empty."
exit 1
fi
# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
exit 1
fi
echo "Run model $MODEL"
echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \
-v /dev/shm:/dev/shm \
vllm/vllm-tpu-bm tail -f /dev/null
echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
if [ "$BUILDKITE" = "true" ]; then
echo "Running inside Buildkite"
buildkite-agent artifact upload "$VLLM_LOG"
buildkite-agent artifact upload "$BM_LOG"
else
echo "Not running inside Buildkite"
fi
#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
echo "Failed to get the throughput"
exit 1
fi
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
exit 1
fi

View File

@ -1,94 +0,0 @@
#!/bin/bash
set -euo pipefail
VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"
if [ -n "$TARGET_COMMIT" ]; then
head_hash=$(git rev-parse HEAD)
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
exit 1
fi
fi
echo "model: $MODEL"
echo
#
# create a log folder
#
mkdir "$WORKSPACE/log"
# TODO: Move to image building.
pip install pandas
pip install datasets
#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
exit 1
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
echo "Application started"
break
else
echo "wait for 10 seconds..."
sleep 10
fi
done
#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."
echo
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

View File

@ -8,7 +8,6 @@
# Documentation # Documentation
# label(str): the name of the test. emoji allowed. # label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only # fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands. # command(str): the single command to run for tests. incompatible with commands.
@ -32,17 +31,16 @@ steps:
##### fast check tests ##### ##### fast check tests #####
- label: Documentation Build # 2min - label: Documentation Build # 2min
mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/test_docs/docs"
working_dir: "/vllm-workspace/test_docs"
fast_check: true fast_check: true
no_gpu: True no_gpu: True
commands: commands:
- pip install -r ../requirements/docs.txt - pip install -r ../../requirements/docs.txt
# TODO: add `--strict` once warnings in docstrings are fixed - SPHINXOPTS=\"-W\" make html
- mkdocs build # Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/inference_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min - label: Async Engine, Inputs, Utils, Worker Test # 24min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/mq_llm_engine - tests/mq_llm_engine
@ -58,13 +56,11 @@ steps:
- pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal - pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils - pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker - pytest -v -s worker # Worker
- label: Python-only Installation Test - label: Python-only Installation Test
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh - tests/standalone_tests/python_only_compile.sh
- setup.py - setup.py
@ -72,9 +68,8 @@ steps:
- bash standalone_tests/python_only_compile.sh - bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min - label: Basic Correctness Test # 30min
mirror_hardwares: [amdexperimental, amdproduction] #mirror_hardwares: [amd]
fast_check: true fast_check: true
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_basic_correctness
@ -89,7 +84,6 @@ steps:
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test - label: Chunked Prefill Test
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness/test_chunked_prefill - tests/basic_correctness/test_chunked_prefill
@ -98,7 +92,7 @@ steps:
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min - label: Core Test # 10min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/core - vllm/core
@ -108,10 +102,9 @@ steps:
- pytest -v -s core - pytest -v -s core
- label: Entrypoints Test # 40min - label: Entrypoints Test # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
torch_nightly: true mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/entrypoints/llm - tests/entrypoints/llm
@ -125,12 +118,11 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min - label: Distributed Tests (4 GPUs) # 10min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -138,38 +130,29 @@ steps:
- vllm/core/ - vllm/core/
- tests/distributed/test_utils - tests/distributed/test_utils
- tests/distributed/test_pynccl - tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/spec_decode/e2e/test_integration_dist_tp4 - tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile/test_basic_correctness - tests/compile/test_basic_correctness
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
- tests/v1/engine/test_engine_core_client.py
commands: commands:
# test with tp=2 and external_dp=2 # test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py - python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
- pushd ../examples/offline_inference - pushd ../examples/offline_inference
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - python3 rlhf.py
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd - popd
- label: Metrics, Tracing Test # 10min - label: Metrics, Tracing Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -178,17 +161,17 @@ steps:
commands: commands:
- pytest -v -s metrics - pytest -v -s metrics
- "pip install \ - "pip install \
'opentelemetry-sdk>=1.26.0' \ 'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0' \ 'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0' \ 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'" 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing - pytest -v -s tracing
##### fast check tests ##### ##### fast check tests #####
##### 1 GPU test ##### ##### 1 GPU test #####
- label: Regression Test # 5min - label: Regression Test # 5min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_regression - tests/test_regression
@ -198,7 +181,7 @@ steps:
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min - label: Engine Test # 10min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
@ -206,31 +189,28 @@ steps:
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately # OOM in the CI unless we run this separately
- pytest -v -s tokenization - pytest -v -s tokenization
- label: V1 Test - label: V1 Test
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
commands: commands:
# split the test to avoid interference # split the test to avoid interference
- pytest -v -s v1/core - pytest -v -s v1/core
- pytest -v -s v1/entrypoints
- pytest -v -s v1/engine - pytest -v -s v1/engine
- pytest -v -s v1/entrypoints - pytest -v -s v1/entrypoints
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/worker - pytest -v -s v1/worker
- pytest -v -s v1/structured_output - pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode - pytest -v -s v1/test_stats.py
- pytest -v -s v1/kv_connector/unit
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py - pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
@ -239,8 +219,8 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min - label: Examples Test # 25min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/entrypoints - vllm/entrypoints
- examples/ - examples/
@ -255,7 +235,7 @@ steps:
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_embedding.py --seed 0 - python3 offline_inference/vision_language_embedding.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
@ -264,7 +244,7 @@ steps:
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min - label: Prefix Caching Test # 9min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/prefix_caching - tests/prefix_caching
@ -272,7 +252,6 @@ steps:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Samplers Test # 36min - label: Samplers Test # 36min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/layers - vllm/model_executor/layers
- vllm/sampling_metadata.py - vllm/sampling_metadata.py
@ -282,8 +261,18 @@ steps:
- pytest -v -s samplers - pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
- tests/model_executor/test_guided_processors
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 40min - label: Speculative decoding tests # 40min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/spec_decode - vllm/spec_decode
- tests/spec_decode - tests/spec_decode
@ -294,30 +283,14 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each - label: LoRA Test %N # 15min each
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
parallelism: 4 parallelism: 4
- label: PyTorch Compilation Unit Tests
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- label: PyTorch Fullgraph Smoke Test # 9min - label: PyTorch Fullgraph Smoke Test # 9min
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
@ -326,127 +299,61 @@ steps:
# these tests need to be separated, cannot combine # these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/piecewise/test_full_cudagraph.py - pytest -v -s compile/test_pass_manager.py
- label: PyTorch Fullgraph Test # 18min - label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_full_graph.py - pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test - label: Kernels Test %N # 1h each
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- tests/kernels/core
commands:
- pytest -v -s kernels/core
- label: Kernels Attention Test %N
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- csrc/attention/
- vllm/attention - vllm/attention
- vllm/v1/attention - tests/kernels
- tests/kernels/attention
commands: commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2 parallelism: 4
- label: Kernels Quantization Test %N
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
commands:
- pytest -v -s kernels/moe
- label: Kernels Mamba Test
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
commands:
- pytest -v -s kernels/mamba
- label: Tensorizer Test # 11min - label: Tensorizer Test # 11min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amd]
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/model_loader - vllm/model_executor/model_loader
- tests/tensorizer_loader - tests/tensorizer_loader
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader - pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test
mirror_hardwares: [amdexperimental, amdproduction]
soft_fail: true
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- label: Benchmarks # 9min - label: Benchmarks # 9min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/.buildkite" working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- benchmarks/ - benchmarks/
commands: commands:
- bash scripts/run-benchmarks.sh - bash run-benchmarks.sh
- label: Benchmarks CLI Test # 10min - label: Quantization Test # 33min
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- label: Quantization Test
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
- tests/quantization - tests/quantization
commands: command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: OpenAI API correctness - label: OpenAI API correctness
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/entrypoints/openai/ - vllm/entrypoints/openai/
@ -455,7 +362,6 @@ steps:
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min - label: Encoder Decoder tests # 5min
mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/encoder_decoder - tests/encoder_decoder
@ -463,117 +369,98 @@ steps:
- pytest -v -s encoder_decoder - pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min - label: OpenAI-Compatible Tool Use # 20 min
mirror_hardwares: [amdexperimental]
fast_check: false fast_check: false
mirror_hardwares: [ amd ]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/tool_use - tests/tool_use
- tests/mistral_tool_use
commands: commands:
- pytest -v -s tool_use - pytest -v -s tool_use
- pytest -v -s mistral_tool_use
##### models test ##### ##### models test #####
- label: Basic Models Test # 24min - label: Basic Models Test # 24min
mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models - tests/models
commands: commands:
- pytest -v -s models/test_transformers.py - pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- pytest -v -s models/test_vision.py - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) - label: Language Models Test (Standard) # 32min
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language - tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/embedding/language -m core_model
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model
- label: Language Models Test (Extended Generation) # 1hr20min - label: Language Models Test (Extended) # 1h10min
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language/generation - tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/embedding/language -m 'not core_model'
- pytest -v -s models/language/generation -m 'not core_model'
- label: Language Models Test (Extended Pooling) # 36min - label: Multi-Modal Models Test (Standard) # 40min
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language/pooling - tests/models/decoder_only/audio_language
commands: - tests/models/decoder_only/vision_language
- pytest -v -s models/language/pooling -m 'not core_model' - tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- label: Multi-Modal Models Test (Standard) - tests/models/encoder_decoder/vision_language
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch' - pytest -v -s models/multimodal
- pytest -v -s models/multimodal/processing - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- label: Multi-Modal Models Test (Extended) 1 - label: Multi-Modal Models Test (Extended) 1 # 48m
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 - label: Multi-Modal Models Test (Extended) 2 # 38m
mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/decoder_only/vision_language
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
- label: Multi-Modal Models Test (Extended) 3
mirror_hardwares: [amdexperimental, amdproduction]
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
- label: Quantized Models Test
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
commands:
- pytest -v -s models/quantization
# This test is used only in PR development phase to test individual models and should never run on main # This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test - label: Custom Models Test
mirror_hardwares: [amdexperimental, amdproduction]
optional: true optional: true
commands: commands:
- echo 'Testing custom models...' - echo 'Testing custom models...'
@ -585,7 +472,6 @@ steps:
##### multi gpus test ##### ##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min - label: Distributed Comm Ops Test # 7min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -596,7 +482,6 @@ steps:
- pytest -v -s distributed/test_shm_broadcast.py - pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min - label: 2 Node Tests (4 GPUs in total) # 16min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
num_nodes: 2 num_nodes: 2
@ -615,7 +500,7 @@ steps:
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 40min - label: Distributed Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental] #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -629,12 +514,7 @@ steps:
- vllm/worker/worker.py - vllm/worker/worker.py
- vllm/worker/model_runner.py - vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py - entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
@ -642,26 +522,23 @@ steps:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently. # this test fails consistently.
# TODO: investigate and fix # TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- label: Plugin Tests (2 GPUs) # 40min - label: Plugin Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
- vllm/plugins/ - vllm/plugins/
- tests/plugins/ - tests/plugins/
commands: commands:
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform # begin platform plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform - pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py - pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y - pip uninstall vllm_add_dummy_platform -y
@ -672,10 +549,8 @@ steps:
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Multi-step Tests (4 GPUs) # 36min - label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -696,7 +571,6 @@ steps:
- pytest -v -s multi_step/test_correctness_llm.py - pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 45min - label: Pipeline Parallelism Test # 45min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -710,7 +584,6 @@ steps:
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA TP Test (Distributed) - label: LoRA TP Test (Distributed)
mirror_hardwares: [amdexperimental, amdproduction]
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
@ -719,14 +592,17 @@ steps:
# FIXIT: find out which code initialize cuda before running the test # FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it # before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that # There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation. # requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
@ -736,7 +612,6 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional - label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
gpu: a100 gpu: a100
@ -775,4 +650,4 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - bash ./run-tests.sh -c configs/models-large.txt -t 4

View File

@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html # if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels" echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html # if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu126 wheels" echo "Skipping index files for cu121 wheels"
else else
# only upload index.html for cu128 wheels (default wheels) # only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi fi
@ -66,13 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html # if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels" echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html # if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu126 wheels" echo "Skipping index files for cu121 wheels"
else else
# only upload index.html for cu128 wheels (default wheels) # only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/" aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

21
.github/CODEOWNERS vendored
View File

@ -10,17 +10,13 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96 /vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth CMakeLists.txt @tlrmchlsmth
# vLLM V1 # vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm /vllm/v1/structured_output @mgoin @russellb
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo /.buildkite/lm-eval-harness @mgoin @simon-mo
@ -29,8 +25,8 @@ CMakeLists.txt @tlrmchlsmth
/tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao /tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/kernels @tlrmchlsmth @WoosukKwon /tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96 /tests/models @DarkLight1337 @ywang96
@ -40,11 +36,6 @@ CMakeLists.txt @tlrmchlsmth
/tests/quantization @mgoin @robertgshaw2-redhat /tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96 /tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao /tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee
# Docs
/docs @hmellor
mkdocs.yaml @hmellor

View File

@ -14,7 +14,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -14,7 +14,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -8,35 +8,25 @@ body:
attributes: attributes:
value: > value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
attributes:
value: |
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
- Passwords or authentication credentials
- Private URLs or endpoints
- Personal or confidential data
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea - type: textarea
attributes: attributes:
label: Your current environment label: Your current environment
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues. It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: | value: |
<details> <details>
<summary>The output of <code>python collect_env.py</code></summary> <summary>The output of `python collect_env.py`</summary>
```text ```text
Your output of `python collect_env.py` here Your output of `python collect_env.py` here
``` ```
</details> </details>
validations: validations:
required: true required: true
@ -85,20 +75,20 @@ body:
``` ```
``` ```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present. The error message you got, with the full traceback.
``` ```
validations: validations:
required: true required: true
- type: markdown - type: markdown
attributes: attributes:
value: | value: >
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output: ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for reporting 🙏! Thanks for contributing 🎉!
- type: checkboxes - type: checkboxes
id: askllm id: askllm
attributes: attributes:

View File

@ -1,69 +0,0 @@
name: 🧪 CI failure report
description: Report a failing test.
title: "[CI Failure]: "
labels: ["ci-failure"]
body:
- type: markdown
attributes:
value: >
#### Include the name of the failing Buildkite step and test file in the title.
- type: input
attributes:
label: Name of failing test
description: |
Paste in the fully-qualified name of the failing test from the logs.
placeholder: |
`path/to/test_file.py::test_name[params]`
validations:
required: true
- type: checkboxes
attributes:
label: Basic information
description: Select all items that apply to the failing test.
options:
- label: Flaky test
- label: Can reproduce locally
- label: Caused by external libraries (e.g. bug in `transformers`)
- type: textarea
attributes:
label: 🧪 Describe the failing test
description: |
Please provide a clear and concise description of the failing test.
placeholder: |
A clear and concise description of the failing test.
```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
```
validations:
required: true
- type: textarea
attributes:
label: 📝 History of failing test
description: |
Since when did the test start to fail?
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
placeholder: |
Approximate timeline and/or problematic PRs
A link to the Buildkite analytics of the failing test (if available)
validations:
required: true
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
- type: markdown
attributes:
value: >
Thanks for reporting 🙏!

View File

@ -9,7 +9,7 @@ body:
value: > value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model. #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea - type: textarea
attributes: attributes:
label: The model to consider. label: The model to consider.

View File

@ -35,7 +35,7 @@ body:
description: | description: |
Please run the following and paste the output below. Please run the following and paste the output below.
```sh ```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it. # For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py python collect_env.py
``` ```

View File

@ -1,18 +1,6 @@
## Essential Elements of an Effective PR Description Checklist FILL IN THE PR DESCRIPTION HERE
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. FIX #xxxx (*link existing issues this PR will resolve*)
## Purpose
## Test Plan
## Test Result
## (Optional) Documentation Update
<!--- pyml disable-next-line no-emphasis-as-heading --> <!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions) **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

124
.github/mergify.yml vendored
View File

@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/ - files~=\.buildkite/
- files~=^cmake/ - files~=^cmake/
- files=CMakeLists.txt - files=CMakeLists.txt
- files~=^docker/Dockerfile - files~=^Dockerfile
- files~=^requirements.*\.txt - files~=^requirements.*\.txt
- files=setup.py - files=setup.py
actions: actions:
@ -36,20 +36,6 @@ pull_request_rules:
add: add:
- frontend - frontend
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
actions:
label:
add:
- llama
- name: label-multi-modality - name: label-multi-modality
description: Automatically apply multi-modality label description: Automatically apply multi-modality label
conditions: conditions:
@ -65,58 +51,15 @@ pull_request_rules:
add: add:
- multi-modality - multi-modality
- name: label-qwen
description: Automatically apply qwen label
conditions:
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
- body~=(?i)Qwen
actions:
label:
add:
- qwen
- name: label-rocm
description: Automatically apply rocm label
conditions:
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- files~=^vllm/attention/backends/rocm.*\.py
- files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
- title~=(?i)ROCm
actions:
label:
add:
- rocm
- name: label-structured-output - name: label-structured-output
description: Automatically apply structured-output label description: Automatically apply structured-output label
conditions: conditions:
- or: - or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/ - files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py - files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py - files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/ - files=benchmarks/benchmark_serving_guided.py
- files=tests/v1/entrypoints/llm/test_guided_generate.py - files=benchmarks/benchmark_guided.py
- files~=^vllm/v1/structured_output/
actions: actions:
label: label:
add: add:
@ -145,56 +88,6 @@ pull_request_rules:
add: add:
- v1 - v1
- name: label-tpu
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- or:
- files~=tpu.py
- files~=_tpu
- files~=tpu_
- files~=/tpu/
- files~=pallas
actions:
label:
add:
- tpu
- name: label-tpu-remove
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- and:
- -files~=tpu.py
- -files~=_tpu
- -files~=tpu_
- -files~=/tpu/
- -files~=pallas
actions:
label:
remove:
- tpu
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling
- name: ping author on conflicts and add 'needs-rebase' label - name: ping author on conflicts and add 'needs-rebase' label
conditions: conditions:
- conflict - conflict
@ -210,17 +103,6 @@ pull_request_rules:
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
- name: assign reviewer for tensorizer changes
conditions:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- files~=^tests/tensorizer_loader/
actions:
assign:
users:
- "sangstar"
- name: remove 'needs-rebase' label when conflict is resolved - name: remove 'needs-rebase' label when conflict is resolved
conditions: conditions:
- -conflict - -conflict

View File

@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)" # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF python3 - <<EOF
import regex as re import re
with open("${NEW}", "r") as file: with open("${NEW}", "r") as file:
content = file.read() content = file.read()

View File

@ -1,6 +1,4 @@
name: Add label on auto-merge enabled name: Add label on auto-merge enabled
permissions:
pull-requests: write
on: on:
pull_request_target: pull_request_target:
types: types:

View File

@ -20,12 +20,7 @@ jobs:
with: with:
python-version: '3.12' python-version: '3.12'
- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install regex
- name: Update PR description - name: Update PR description
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"

View File

@ -2,9 +2,6 @@ name: Lint and Deploy Charts
on: pull_request on: pull_request
permissions:
contents: read
jobs: jobs:
lint-and-deploy: lint-and-deploy:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -53,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu - name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env . run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster - name: Configuration of docker images, network and namespace for the kind cluster
run: | run: |
@ -69,7 +66,7 @@ jobs:
export AWS_SECRET_ACCESS_KEY=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test - name: curl test
run: | run: |
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@ -82,4 +79,4 @@ jobs:
"max_tokens": 7, "max_tokens": 7,
"temperature": 0 "temperature": 0
}'):$CODE" }'):$CODE"
echo "$CODE" echo "$CODE"

View File

@ -5,9 +5,6 @@ on:
push: push:
branches: [main] branches: [main]
permissions:
contents: read
jobs: jobs:
pre-commit: pre-commit:
runs-on: ubuntu-latest runs-on: ubuntu-latest

View File

@ -1,6 +1,4 @@
name: PR Reminder Comment Bot name: PR Reminder Comment Bot
permissions:
pull-requests: write
on: on:
pull_request_target: pull_request_target:
types: [opened] types: [opened]

10
.gitignore vendored
View File

@ -2,7 +2,7 @@
/vllm/_version.py /vllm/_version.py
# vllm-flash-attn built from source # vllm-flash-attn built from source
vllm/vllm_flash_attn/* vllm/vllm_flash_attn/
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
@ -77,6 +77,10 @@ instance/
# Scrapy stuff: # Scrapy stuff:
.scrapy .scrapy
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/
# PyBuilder # PyBuilder
.pybuilder/ .pybuilder/
target/ target/
@ -146,7 +150,6 @@ venv.bak/
# mkdocs documentation # mkdocs documentation
/site /site
docs/examples
# mypy # mypy
.mypy_cache/ .mypy_cache/
@ -199,6 +202,3 @@ benchmarks/**/*.json
# Linting # Linting
actionlint actionlint
shellcheck*/ shellcheck*/
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -1,6 +1,3 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages: default_stages:
- pre-commit # Run locally - pre-commit # Run locally
- manual # Run in CI - manual # Run in CI
@ -11,45 +8,43 @@ repos:
hooks: hooks:
- id: yapf - id: yapf
args: [--in-place, --verbose] args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7 rev: v0.9.3
hooks: hooks:
- id: ruff - id: ruff
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - repo: https://github.com/codespell-project/codespell
files: ^(.buildkite|benchmarks|examples)/.* rev: v2.4.0
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks: hooks:
- id: typos - id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
rev: 6.0.1 rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
hooks: hooks:
- id: isort - id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format - repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3 rev: v19.1.7
hooks: hooks:
- id: clang-format - id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda] types_or: [c++, cuda]
args: [--style=file, --verbose] args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown - repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29 rev: v0.9.27
hooks: hooks:
- id: pymarkdown - id: pymarkdown
exclude: '.*\.inc\.md'
args: [fix] args: [fix]
- repo: https://github.com/rhysd/actionlint - repo: https://github.com/rhysd/actionlint
rev: v1.7.7 rev: v1.7.7
hooks: hooks:
- id: actionlint - id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit - repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.17 rev: 0.6.2
hooks: hooks:
- id: pip-compile - id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] args: [requirements/test.in, -o, requirements/test.txt]
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- repo: local - repo: local
hooks: hooks:
@ -58,7 +53,7 @@ repos:
entry: tools/mypy.sh 0 "local" entry: tools/mypy.sh 0 "local"
language: python language: python
types: [python] types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
stages: [pre-commit] # Don't run in CI stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9 name: Run mypy for Python 3.9
@ -104,8 +99,8 @@ repos:
args: args:
- -c - -c
- | - |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)" printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
fi fi
language: system language: system
verbose: true verbose: true
@ -124,32 +119,6 @@ repos:
language: system language: system
always_run: true always_run: true
pass_filenames: false pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
language: script
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/check_triton_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/check_pickle_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [pathspec, regex]
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion

View File

@ -8,8 +8,12 @@ build:
tools: tools:
python: "3.12" python: "3.12"
mkdocs: sphinx:
configuration: mkdocs.yaml configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:

View File

@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@ -23,17 +24,17 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables # Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}") set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# #
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
# #
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
# #
# Supported/expected torch versions for CUDA/ROCm. # Supported/expected torch versions for CUDA/ROCm.
@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# #
# Note: the CUDA torch version is derived from pyproject.toml and various # Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0") set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
# #
# Try to find python package with an executable that exactly matches # Try to find python package with an executable that exactly matches
@ -79,15 +80,6 @@ endif()
# #
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
endif()
# #
# Forward the non-CUDA device extensions to external CMake scripts. # Forward the non-CUDA device extensions to external CMake scripts.
# #
@ -182,6 +174,9 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Set rocm version dev int.
#
if(VLLM_GPU_LANG STREQUAL "HIP") if(VLLM_GPU_LANG STREQUAL "HIP")
# #
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -189,6 +184,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3") set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
# #
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed. # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -231,35 +227,28 @@ endif()
# #
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/cache_kernels.cu" "csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu" "csrc/attention/paged_attention_v2.cu"
"csrc/attention/merge_attn_states.cu"
"csrc/attention/vertical_slash_index.cu"
"csrc/pos_encoding_kernels.cu" "csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu" "csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu" "csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu" "csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu" "csrc/quantization/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu" "csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu" "csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu" "csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use") # Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -277,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above. # Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG ${CUTLASS_REVISION} GIT_TAG v3.8.0
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -289,16 +278,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp" "csrc/cutlass_extensions/common.cpp")
"csrc/attention/mla/cutlass_mla_entry.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}" SRCS "${VLLM_EXT_SRC}"
@ -307,55 +297,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs. # Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
#
# For the Marlin kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
)
if (NOT marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin generation failed."
" Result: \"${marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else()
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin generate script hash" FORCE)
message(STATUS "Marlin generation completed successfully.")
endif()
else()
message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
set(MARLIN_SRCS set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu" "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@ -420,14 +365,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
# require CUDA 12.8 or later # CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
) )
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
@ -452,9 +396,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# #
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x. # kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}") "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x # subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS) if (SCALED_MM_2X_ARCHS)
@ -505,9 +448,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
@ -520,52 +461,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(FP4_ARCHS) set(FP4_ARCHS)
endif() endif()
# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
# Add MLA-specific include directories only to MLA source files
set_source_files_properties(${SRCS}
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
else()
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
# clear MLA_ARCHS
set(MLA_ARCHS)
endif()
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
"if you intend on running FP8 quantized MoE models on Hopper.")
else()
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
# #
# Machete kernels # Machete kernels
@ -683,54 +578,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}") CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
#
# For the Marlin MOE kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
)
if (NOT moe_marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin MOE generation failed."
" Result: \"${moe_marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
else()
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}" SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}") CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else() else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found" message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@ -738,17 +602,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_PERMUTE_SRC
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
"csrc/moe/moe_permute_unpermute_op.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_PERMUTE_SRC}"
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
message(STATUS "Enabling moe extension.") message(STATUS "Enabling moe extension.")
define_gpu_extension_target( define_gpu_extension_target(
_moe_C _moe_C
@ -757,8 +610,6 @@ define_gpu_extension_target(
SOURCES ${VLLM_MOE_EXT_SRC} SOURCES ${VLLM_MOE_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS} COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES} ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
@ -768,7 +619,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
# #
set(VLLM_ROCM_EXT_SRC set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp" "csrc/rocm/torch_bindings.cpp"
"csrc/rocm/skinny_gemms.cu"
"csrc/rocm/attention.cu") "csrc/rocm/attention.cu")
define_gpu_extension_target( define_gpu_extension_target(
@ -785,7 +635,5 @@ endif()
# For CUDA we also build and ship some external projects. # For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA") if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/flashmla.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake) include(cmake/external_projects/vllm_flash_attn.cmake)
endif () endif ()

View File

@ -1,3 +1,3 @@
# Contributing to vLLM # Contributing to vLLM
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing). You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).

View File

@ -2,42 +2,33 @@
# to run the OpenAI compatible server. # to run the OpenAI compatible server.
# Please update any changes made here to # Please update any changes made here to
# docs/contributing/dockerfile/dockerfile.md and # docs/source/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png # docs/source/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install minimal dependencies and uv
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN apt-get update -y \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get install -y ccache git curl wget sudo \
&& apt-get update -y \ && curl -LsSf https://astral.sh/uv/install.sh | sh
&& apt-get install -y ccache software-properties-common git curl sudo \
&& for i in 1 2 3; do \ # Add uv to PATH
add-apt-repository -y ppa:deadsnakes/ppa && break || \ ENV PATH="/root/.local/bin:$PATH"
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ # Create venv with specified Python and activate by placing at the front of path
done \ ENV VIRTUAL_ENV="/opt/venv"
&& apt-get update -y \ RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels # as it was causing spam when compiling the CUTLASS kernels
@ -55,29 +46,25 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace WORKDIR /workspace
# install build and runtime dependencies
# arm64 (GH200) build follows the practice of "use existing pytorch" build, # arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first, # we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps # pytorch will not appear as a vLLM dependency in all of the following steps
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
fi fi
COPY requirements/common.txt requirements/common.txt COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt \ uv pip install -r requirements/cuda.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch # cuda arch list used by torch
# can be useful for both `dev` and `test` # can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2 # explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243 # see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size # Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real' ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@ -94,11 +81,9 @@ COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install -r requirements/build.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
@ -165,60 +150,43 @@ FROM base as dev
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
COPY requirements/lint.txt requirements/lint.txt COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt \ uv pip install -r requirements/dev.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed # TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM ARG TARGETPLATFORM
SHELL ["/bin/bash", "-c"]
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies # Install minimal dependencies and uv
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN apt-get update -y \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get install -y ccache git curl wget sudo vim \
&& apt-get update -y \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && curl -LsSf https://astral.sh/uv/install.sh | sh
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& for i in 1 2 3; do \ # Add uv to PATH
add-apt-repository -y ppa:deadsnakes/ppa && break || \ ENV PATH="/root/.local/bin:$PATH"
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ # Create venv with specified Python and activate by placing at the front of path
done \ ENV VIRTUAL_ENV="/opt/venv"
&& apt-get update -y \ RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -232,61 +200,39 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step # after this step
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
fi fi
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \ uv pip install dist/*.whl --verbose
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' # $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive # $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer # $ cd flashinfer
# $ git checkout v0.2.6.post1 # $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ python -m flashinfer.aot # $ rm -rf build
# $ python -m build --no-isolation --wheel # $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
# $ ls -la dist # $ ls dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
# FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
if [[ "$CUDA_VERSION" == 12.8* ]]; then \
uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl; \
else \
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' && \
git clone https://github.com/flashinfer-ai/flashinfer.git --single-branch --branch v0.2.6.post1 --recursive && \
# Needed to build AOT kernels
(cd flashinfer && \
python3 -m flashinfer.aot && \
uv pip install --system --no-build-isolation . \
) && \
rm -rf flashinfer; \
fi \
fi fi
COPY examples examples COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
RUN --mount=type=cache,target=/root/.cache/uv \ # Although we build Flashinfer with AOT mode, there's still
. /etc/environment && \
uv pip list
# Even when we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to # some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation. # install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed # TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ uv pip install -r requirements/build.txt
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
@ -300,26 +246,18 @@ ADD . /vllm-workspace/
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \
fi
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils uv pip install -r requirements/dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer uv pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1 ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet) # Copy in the v1 package for testing (it isn't distributed yet)
@ -330,15 +268,12 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
# will not be imported by other tests # will not be imported by other tests
RUN mkdir test_docs RUN mkdir test_docs
RUN mv docs test_docs/ RUN mv docs test_docs/
RUN cp -r examples test_docs/
RUN mv vllm test_docs/ RUN mv vllm test_docs/
RUN mv mkdocs.yaml test_docs/
#################### TEST IMAGE #################### #################### TEST IMAGE ####################
#################### OPENAI API SERVER #################### #################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images # base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -347,9 +282,9 @@ ENV UV_HTTP_TIMEOUT=500
# install additional dependencies for openai api server # install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \ else \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi fi
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image

69
Dockerfile.cpu Normal file
View File

@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
COPY ./ /workspace/vllm COPY ./ /workspace/vllm

View File

@ -1,6 +1,6 @@
# default base image # default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04" ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE
@ -21,8 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict # uninstall transformers-neuronx package explicitly to avoid version conflict
@ -34,7 +35,7 @@ RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN python3 -m pip install -U \ RUN python3 -m pip install -U \
'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements/neuron.txt -r requirements/neuron.txt
ENV VLLM_TARGET_DEVICE neuron ENV VLLM_TARGET_DEVICE neuron
@ -48,8 +49,6 @@ RUN python3 -m pip install -e tests/vllm_test_utils
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict # FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
# overwrite entrypoint to run bash script # overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py

37
Dockerfile.ppc64le Normal file
View File

@ -0,0 +1,37 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
# Some packages in requirements/cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements/cpu.txt \
xformers uvloop==0.20.0
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -1,5 +1,7 @@
# default base image # default base image
ARG REMOTE_VLLM="0" ARG REMOTE_VLLM="0"
ARG USE_CYTHON="0"
ARG BUILD_RPD="1"
ARG COMMON_WORKDIR=/app ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base ARG BASE_IMAGE=rocm/vllm-dev:base
@ -10,10 +12,9 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities # Install some basic utilities
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
apt-transport-https ca-certificates wget curl
# Remove sccache # Remove sccache
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR} WORKDIR ${COMMON_WORKDIR}
@ -28,17 +29,18 @@ ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main" ARG VLLM_BRANCH="main"
ONBUILD RUN git clone ${VLLM_REPO} \ ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \ && cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \ && git checkout ${VLLM_BRANCH}
&& git checkout FETCH_HEAD
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# ----------------------- # -----------------------
# vLLM build stages # vLLM build stages
FROM fetch_vllm AS build_vllm FROM fetch_vllm AS build_vllm
ARG USE_CYTHON
# Build vLLM # Build vLLM
RUN cd vllm \ RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \ && python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \ && python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist && python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm FROM scratch AS export_vllm
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
@ -87,6 +89,13 @@ RUN case "$(which python3)" in \
*) ;; esac *) ;; esac
RUN python3 -m pip install --upgrade huggingface-hub[cli] RUN python3 -m pip install --upgrade huggingface-hub[cli]
ARG BUILD_RPD
RUN if [ ${BUILD_RPD} -eq "1" ]; then \
git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
&& cd rocmProfileData/rpd_tracer \
&& pip install -r requirements.txt && cd ../ \
&& make && make install \
&& cd hipMarker && python3 setup.py install ; fi
# Install vLLM # Install vLLM
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
@ -104,10 +113,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false ENV TOKENIZERS_PARALLELISM=false
# ENV that can improve safe tensor loading, and end-to-end time
ENV SAFETENSORS_FAST_GPU=1
# Performance environment variable. # Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1 ENV HIP_FORCE_DEV_KERNARG=1
CMD ["/bin/bash"] CMD ["/bin/bash"]

View File

@ -1,18 +1,18 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
ARG HIPBLASLT_BRANCH="db8e93b4" ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b" ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION= ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d" ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006" ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git" ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="295f2ed4" ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.21.0" ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa" ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="c1debd8" ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
@ -31,11 +31,8 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install Python and other dependencies
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ && apt-get install -y software-properties-common git curl sudo vim less \
&& for i in 1 2 3; do \ && add-apt-repository ppa:deadsnakes/ppa \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-lib2to3 python-is-python3 \ python${PYTHON_VERSION}-lib2to3 python-is-python3 \
@ -45,7 +42,7 @@ RUN apt-get update -y \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
FROM base AS build_hipblaslt FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH ARG HIPBLASLT_BRANCH
@ -63,8 +60,7 @@ RUN cd hipBLAS-common \
RUN git clone https://github.com/ROCm/hipBLASLt RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \ RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \ && git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \ && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \ && cd build/release \
&& make package && make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -114,24 +110,11 @@ RUN git clone ${FA_REPO}
RUN cd flash-attention \ RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \ && git checkout ${FA_BRANCH} \
&& git submodule update --init \ && git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install && cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS final FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \ dpkg -i /install/*deb \
@ -147,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BASE_IMAGE ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
ARG LEGACY_HIPBLASLT_OPTION ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH ARG RCCL_BRANCH
ARG RCCL_REPO ARG RCCL_REPO
@ -164,8 +154,6 @@ ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -179,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
RUN microdnf install -y \ RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \ openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
microdnf clean all microdnf clean all
# Python Installation # Python Installation
@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
cd ../../python && \ cd ../../python && \
export PYARROW_PARALLEL=4 && \ export PYARROW_PARALLEL=4 && \
export ARROW_BUILD_TYPE=release && \ export ARROW_BUILD_TYPE=release && \
uv pip install -r requirements-build.txt && \ uv pip install -r requirements/build.txt && \
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel
FROM python-install AS numa-build FROM python-install AS numa-build
@ -84,58 +84,18 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
rustup default stable && \ rustup default stable && \
rustup show rustup show
FROM python-install AS torch
ARG TORCH_VERSION=2.7.0
ENV export _GLIBCXX_USE_CXX11_ABI=1
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/pytorch/pytorch.git && \
cd pytorch && \
git checkout v2.7.0 && \
git submodule sync && \
git submodule update --init --recursive && \
uv pip install cmake ninja && \
uv pip install -r requirements.txt && \
python setup.py bdist_wheel
FROM python-install AS torch-vision FROM python-install AS torch-vision
# Install torchvision # Install torchvision
ARG TORCH_VERSION=2.7.0 ARG TORCH_VERSION=2.7.0.dev20250304
ARG TORCH_VISION_VERSION=v0.20.1 ARG TORCH_VISION_VERSION=v0.20.1
WORKDIR /tmp WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
git clone https://github.com/pytorch/vision.git && \ git clone https://github.com/pytorch/vision.git && \
cd vision && \ cd vision && \
git checkout $TORCH_VISION_VERSION && \ git checkout $TORCH_VISION_VERSION && \
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \ uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
uv pip install -v $TORCH_WHL_FILE && \
python setup.py bdist_wheel python setup.py bdist_wheel
FROM python-install AS hf-xet-builder
# Install hf-xet
WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/huggingface/xet-core.git && \
cd xet-core/hf_xet/ && \
uv pip install maturin patchelf && \
python -m maturin build --release --out dist && \
mkdir -p /tmp/hf-xet/dist && \
cp dist/*.whl /tmp/hf-xet/dist/
# Final build stage # Final build stage
FROM python-install AS vllm-cpu FROM python-install AS vllm-cpu
ARG PYTHON_VERSION ARG PYTHON_VERSION
@ -147,7 +107,6 @@ ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
COPY . /workspace/vllm COPY . /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
@ -161,18 +120,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \ sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
uv pip install -v \ uv pip install -v \
$ARROW_WHL_FILE \ $ARROW_WHL_FILE \
$VISION_WHL_FILE \ $VISION_WHL_FILE \
$HF_XET_WHL_FILE \ --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
$TORCH_WHL_FILE \
--index-strategy unsafe-best-match \ --index-strategy unsafe-best-match \
-r requirements/build.txt \ -r requirements/build.txt \
-r requirements/cpu.txt -r requirements/cpu.txt
@ -195,5 +149,4 @@ USER 2000
WORKDIR /home/vllm WORKDIR /home/vllm
# Set the default entrypoint # Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 -m pip install \ python3 -m pip install \
-r requirements/tpu.txt -r requirements/tpu.txt
RUN python3 -m pip install -e . RUN python3 setup.py develop
# install development dependencies (for testing) # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils RUN python3 -m pip install -e tests/vllm_test_utils

View File

@ -40,6 +40,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 setup.py install python3 setup.py install
# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-extension-for-pytorch==2.6.10+xpu \
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
CMD ["/bin/bash"] CMD ["/bin/bash"]
FROM vllm-base AS vllm-openai FROM vllm-base AS vllm-openai

View File

@ -1,7 +1,7 @@
<p align="center"> <p align="center">
<picture> <picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png"> <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%> <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
</picture> </picture>
</p> </p>
@ -10,42 +10,28 @@ Easy, fast, and cheap LLM serving for everyone
</h3> </h3>
<p align="center"> <p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p> </p>
--- ---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥 *Latest News* 🔥
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
<details>
<summary>Previous News</summary>
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
</details>
--- ---
## About ## About
vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -58,8 +44,8 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests - Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding - Speculative decoding
- Chunked prefill - Chunked prefill
@ -72,14 +58,14 @@ vLLM is flexible and easy to use with:
- Tensor parallelism and pipeline parallelism support for distributed inference - Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs - Streaming outputs
- OpenAI-compatible API server - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support - Prefix caching support
- Multi-LoRA support - Multi-lora support
vLLM seamlessly supports most popular open-source models on HuggingFace, including: vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama) - Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral) - Embedding Models (e.g. E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA) - Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@ -100,14 +86,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing ## Contributing
We welcome and value any contributions and collaborations. We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved. Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
## Sponsors ## Sponsors
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
<!-- Note: Please sort them in alphabetical order. --> <!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/community/sponsors.md --> <!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
Cash Donations: Cash Donations:
- a16z - a16z
- Dropbox - Dropbox
@ -123,7 +109,6 @@ Compute Resources:
- Databricks - Databricks
- DeepInfra - DeepInfra
- Google Cloud - Google Cloud
- Intel
- Lambda Lab - Lambda Lab
- Nebius - Nebius
- Novita AI - Novita AI
@ -156,10 +141,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) - coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu) - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
## Media Kit ## Media Kit
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) - If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).

View File

@ -8,6 +8,4 @@ Please report security issues privately using [the vulnerability submission form
--- ---
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

View File

@ -41,45 +41,29 @@ become available.
<td><code>synthetic</code></td> <td><code>synthetic</code></td>
</tr> </tr>
<tr> <tr>
<td><strong>HuggingFace-VisionArena</strong></td> <td><strong>HuggingFace</strong></td>
<td style="text-align: center;"></td> <td style="text-align: center;">🟡</td>
<td style="text-align: center;"></td> <td style="text-align: center;">🟡</td>
<td><code>lmarena-ai/VisionArena-Chat</code></td> <td>Specify your dataset path on HuggingFace</td>
</tr> </tr>
<tr> <tr>
<td><strong>HuggingFace-InstructCoder</strong></td> <td><strong>VisionArena</strong></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td> <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>data.jsonl</code></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
✅: supported ✅: supported
🟡: Partial support
🚧: to be supported 🚧: to be supported
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` 🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArenas `dataset-name` should be set to `hf`
--- ---
## Example - Online Benchmark ## Example - Online Benchmark
@ -87,7 +71,8 @@ become available.
First start serving your model First start serving your model
```bash ```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
vllm serve ${MODEL_NAME} --disable-log-requests
``` ```
Then run the benchmarking script Then run the benchmarking script
@ -95,13 +80,12 @@ Then run the benchmarking script
```bash ```bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \ MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
--backend vllm \ NUM_PROMPTS=10
--model NousResearch/Hermes-3-Llama-3.1-8B \ BACKEND="vllm"
--endpoint /v1/completions \ DATASET_NAME="sharegpt"
--dataset-name sharegpt \ DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
--num-prompts 10
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -130,38 +114,6 @@ P99 ITL (ms): 8.39
================================================== ==================================================
``` ```
### Custom Dataset
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```
```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
### VisionArena Benchmark for Vision Language Models ### VisionArena Benchmark for Vision Language Models
```bash ```bash
@ -170,114 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--num-prompts 1000 --num-prompts "${NUM_PROMPTS}"
``` ```
### InstructCoder Benchmark with Speculative Decoding ### HuggingFaceDataset Examples
``` bash Currently, HuggingFaceDataset only supports dataset formats
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
--speculative-config $'{"method": "ngram", formats, please consider contributing.
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Other HuggingFaceDataset Examples
```bash ```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmms-lab/LLaVA-OneVision-Data \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--hf-subset "chart2text(cauldron)" \ --num-prompts "${NUM_PROMPTS}" \
--num-prompts 10 --hf-subset "${DATASET_SUBSET}"
``` ```
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \ --backend "${BACKEND}" \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--endpoint /v1/chat/completions \ --endpoint "/v1/chat/completions" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \ --dataset-path "${DATASET_PATH}" \
--hf-split train \ --hf-split "${DATASET_SPLIT}" \
--num-prompts 10 --num-prompts "${NUM_PROMPTS}" \
```
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
**`philschmid/mt-bench`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
``` ```
--- ---
## Example - Offline Throughput Benchmark ## Example - Offline Throughput Benchmark
```bash ```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model "${MODEL_NAME}" \
--dataset-name sonnet \ --dataset-name "${DATASET_NAME}" \
--dataset-path vllm/benchmarks/sonnet.txt \ --dataset-path "${DATASET_PATH}" \
--num-prompts 10 --num-prompts "${NUM_PROMPTS}"
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -291,13 +217,19 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models ### VisionArena Benchmark for Vision Language Models
``` bash ``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \ --model "${MODEL_NAME}" \
--backend vllm-chat \ --backend "vllm-chat" \
--dataset-name hf \ --dataset-name "${DATASET_NAME}" \
--dataset-path lmarena-ai/VisionArena-Chat \ --dataset-path "${DATASET_PATH}" \
--num-prompts 1000 \ --num-prompts "${NUM_PROMPTS}" \
--hf-split train --hf-split "${DATASET_SPLIT}"
``` ```
The `num prompt tokens` now includes image token counts The `num prompt tokens` now includes image token counts
@ -308,82 +240,29 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-config $'{"method": "ngram",
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
"prompt_lookup_min": 2}'
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
### Benchmark with LoRA Adapters ### Benchmark with LoRA Adapters
``` bash ``` bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model meta-llama/Llama-2-7b-hf \ --model "${MODEL_NAME}" \
--backend vllm \ --backend "${BACKEND}" \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset_path "${DATASET_PATH}" \
--dataset_name sharegpt \ --dataset_name "${DATASET_NAME}" \
--num-prompts 10 \ --num-prompts "${NUM_PROMPTS}" \
--max-loras 2 \ --max-loras "${MAX_LORAS}" \
--max-lora-rank 8 \ --max-lora-rank "${MAX_LORA_RANK}" \
--enable-lora \ ${ENABLE_LORA} \
--lora-path yard1/llama-2-7b-sql-lora-test --lora-path "${LORA_PATH}"
``` ```

View File

@ -1,244 +0,0 @@
#!/bin/bash
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# The current server parameter combination is max_num_seqs and max_num_batched_tokens
# It also supports additional requirement: e2e latency and prefix cache.
# Pre-requisite:
# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
# 5. The final result will be saved in RESULT file.
# Example use cases
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach 60% prefix cache, what's the best server parameter?
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
echo "result file: $RESULT"
echo "model: $MODEL"
rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER
cd "$BASE/vllm"
pip install -q datasets
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
pkill -f vllm
echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
else
echo "server started."
fi
echo
echo "run benchmark test..."
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
request_rate=inf
fi
if (( ! meet_latency_requirement )); then
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
break
fi
request_rate=$((request_rate-1))
done
fi
# write the results and update the best result.
if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
best_throughput=$throughput
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
fi
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill vllm
sleep 10
printf '=%.0s' $(seq 1 20)
return 0
}
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"

Some files were not shown because too many files have changed in this diff Show More