Compare commits
1 commit
compile-ep...correct-do

| Author | SHA1 | Date |
|---|---|---|
| | c1d1875ba3 | |
@@ -1,20 +1,14 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
-# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
 
 
 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, "r") as z:
+    with zipfile.ZipFile(zip_file, 'r') as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:

@@ -29,18 +23,14 @@ def check_wheel_size(directory):
             wheel_path = os.path.join(root, file_name)
             wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
             if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                print(
-                    f"Not allowed: Wheel {wheel_path} is larger "
-                    f"({wheel_size_mb:.2f} MB) than the limit "
-                    f"({VLLM_MAX_SIZE_MB} MB)."
-                )
+                print(f"Not allowed: Wheel {wheel_path} is larger "
+                      f"({wheel_size_mb:.2f} MB) than the limit "
+                      f"({VLLM_MAX_SIZE_MB} MB).")
                 print_top_10_largest_files(wheel_path)
                 return 1
             else:
-                print(
-                    f"Wheel {wheel_path} is within the allowed size "
-                    f"({wheel_size_mb:.2f} MB)."
-                )
+                print(f"Wheel {wheel_path} is within the allowed size "
+                      f"({wheel_size_mb:.2f} MB).")
                 return 0
 
 
@@ -50,4 +40,4 @@ if __name__ == "__main__":
         sys.exit(1)
 
     directory = sys.argv[1]
     sys.exit(check_wheel_size(directory))
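For reference, a hedged sketch of how the wheel-size check above is typically invoked; the script path and the `dist/` directory are assumptions for illustration, not part of this diff:

```bash
# Override the default size limit via the environment variable the script reads,
# then point it at the directory containing built wheels. A non-zero exit code
# means at least one wheel exceeded the limit.
export VLLM_MAX_SIZE_MB=250
python .buildkite/check-wheel-size.py dist/
echo $?
```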
@@ -1,6 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import argparse
 import os
 

@@ -8,8 +5,7 @@ template = """<!DOCTYPE html>
 <html>
 <body>
 <h1>Links for vLLM</h1/>
-<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
+<a href="../{wheel_html_escaped}">{wheel}</a><br/>
 </body>
 </html>
 """

@@ -22,25 +18,7 @@ filename = os.path.basename(args.wheel)
 
 with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
-    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
-    if "x86_64" in filename:
-        x86_wheel = filename
-        arm_wheel = filename.replace("x86_64", "aarch64").replace(
-            "manylinux1", "manylinux2014"
-        )
-    elif "aarch64" in filename:
-        x86_wheel = filename.replace("aarch64", "x86_64").replace(
-            "manylinux2014", "manylinux1"
-        )
-        arm_wheel = filename
-    else:
-        raise ValueError(f"Unsupported wheel: {filename}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(
-            x86_wheel=x86_wheel,
-            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
-            arm_wheel=arm_wheel,
-            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
-        )
-    )
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
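A hedged usage sketch for the index generator above. The `--wheel` flag name is inferred from `args.wheel` and the wheel filename is purely illustrative; both are assumptions rather than part of this diff:

```bash
# Assumed invocation: the script writes index.html in the current directory,
# escaping '+' as %2B in the link href (required by CloudFront).
python generate_index.py --wheel dist/vllm-0.6.2+cu121-cp38-abi3-manylinux1_x86_64.whl
```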
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:
@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:
@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
-model_name: "Qwen/Qwen2.5-1.5B-Instruct"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.54
+    value: 0.419
   - name: "exact_match,flexible-extract"
-    value: 0.59
+    value: 0.416
-limit: 1319
+limit: 1000
 num_fewshot: 5
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.335
-  - name: "exact_match,flexible-extract"
-    value: 0.323
-limit: 1319
-num_fewshot: 5
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:
@@ -1,12 +1,11 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.231
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.22
+    value: 0.236
 limit: 1000
 num_fewshot: 5
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:
@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
-model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.30
-  - name: "exact_match,flexible-extract"
-    value: 0.465
-limit: 1319
-num_fewshot: 5
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.47
-  - name: "exact_match,flexible-extract"
-    value: 0.64
-limit: 1319
-num_fewshot: 5
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
-model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.6353
-  - name: "exact_match,flexible-extract"
-    value: 0.637
-limit: null
-num_fewshot: null
@@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml
@@ -1,44 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from pathlib import Path
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--config-list-file",
-        action="store",
-        help="Path to the file listing model config YAMLs (one per line)",
-    )
-    parser.addoption(
-        "--tp-size",
-        action="store",
-        default="1",
-        help="Tensor parallel size to use for evaluation",
-    )
-
-
-@pytest.fixture(scope="session")
-def config_list_file(pytestconfig, config_dir):
-    rel_path = pytestconfig.getoption("--config-list-file")
-    return config_dir / rel_path
-
-
-@pytest.fixture(scope="session")
-def tp_size(pytestconfig):
-    return pytestconfig.getoption("--tp-size")
-
-
-def pytest_generate_tests(metafunc):
-    if "config_filename" in metafunc.fixturenames:
-        rel_path = metafunc.config.getoption("--config-list-file")
-        config_list_file = Path(rel_path).resolve()
-        config_dir = config_list_file.parent
-        with open(config_list_file, encoding="utf-8") as f:
-            configs = [
-                config_dir / line.strip()
-                for line in f
-                if line.strip() and not line.startswith("#")
-            ]
-        metafunc.parametrize("config_filename", configs)
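The conftest.py removed above registers the two pytest options that parameterize the correctness test from a config list. On the base branch, which still ships it, a run looks like the following sketch (the same invocation appears in the test's docstring later in this diff):

```bash
# Parameterizes one test case per YAML listed in configs/models-small.txt.
pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
```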
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``

@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done
 
 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"
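Each config YAML earlier in this diff records the exact baseline command it was generated with, so regenerating a baseline is just a matter of re-running that line; for example, taken verbatim from one of the configs above:

```bash
# Recompute the GSM8k vLLM baseline for one model (batch 32, 250 samples, 5-shot, TP=1).
bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh \
  -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
```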
.buildkite/lm-eval-harness/run-tests.sh (new file, 59 lines)

@@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c )
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
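A hedged usage sketch for the new run-tests.sh above; the working directory and the choice of config list are assumptions for illustration:

```bash
# Run the small-model suite at tensor parallel size 1; the script exports
# LM_EVAL_TEST_DATA_FILE and LM_EVAL_TP_SIZE for each config and invokes pytest.
cd .buildkite/lm-eval-harness
bash run-tests.sh -c configs/models-small.txt -t 1
```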
@@ -1,57 +1,63 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
 
-pytest -s -v test_lm_eval_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4
+* pytest -s test_lm_eval_correctness.py
 """
 
+import os
+from pathlib import Path
+
 import lm_eval
-import numpy as np
+import numpy
 import yaml
 
-RTOL = 0.08
+RTOL = 0.05
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
 
 
-def launch_lm_eval(eval_config, tp_size):
-    trust_remote_code = eval_config.get("trust_remote_code", False)
-    max_model_len = eval_config.get("max_model_len", 4096)
-    model_args = (
-        f"pretrained={eval_config['model_name']},"
-        f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
-        f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
-    )
+def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)
+    model_args = f"pretrained={eval_config['model_name']}," \
+                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
-    )
+        batch_size="auto")
     return results
 
 
-def test_lm_eval_correctness_param(config_filename, tp_size):
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
-
-    results = launch_lm_eval(eval_config, tp_size)
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
 
+    # Confirm scores match ground truth.
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
-            print(
-                f"{task['name']} | {metric['name']}: "
-                f"ground_truth={ground_truth} | measured={measured_value}"
-            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
+            print(f'{task["name"]} | {metric["name"]}: '
+                  f'ground_truth={ground_truth} | measured={measured_value}')
+            success = success and numpy.isclose(
+                ground_truth, measured_value, rtol=RTOL)
 
+    # Assert at the end, print all scores even on failure for debugging.
     assert success
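On the head branch the same test is driven by environment variables rather than pytest options, matching the docstring in the new version above; a minimal sketch using the defaults baked into the code (one config per run):

```bash
# LM_EVAL_TEST_DATA_FILE selects the config YAML; LM_EVAL_TP_SIZE defaults to 1.
export LM_EVAL_TEST_DATA_FILE=.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
export LM_EVAL_TP_SIZE=1
pytest -s test_lm_eval_correctness.py
```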
@@ -1,62 +1,54 @@
 # vLLM benchmark suite
 
 
 ## Introduction
 
 This directory contains two sets of benchmark for vllm.
 
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
 
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
 
 **Benchmarking Duration**: about 1hr.
 
 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
 
 
 ## Nightly benchmark quick overview
 
 **Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
 
 **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
 
 **Benchmarking Duration**: about 3.5hrs.
 
 
 
 ## Trigger the benchmark
 
 Performance benchmark will be triggered when:
 
 - A PR being merged into vllm.
 - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
 
-Manually Trigger the benchmark
-
-```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-```
-
-Runtime environment variables:
-
-- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
-- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
-- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
-- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
-- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
-- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
-
 Nightly benchmark will be triggered when:
 
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
 
 
 ## Performance benchmark details
 
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
-> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
->
-### Latency test
+#### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
 
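The block removed above documents how to trigger the suite manually and which runtime variables it honors; as a hedged sketch grounded in that removed text (the CPU JSON filenames come from the NOTE removed in the next hunk), a local manual run on a CPU host might look like:

```bash
# ON_CPU and SERVING_JSON are among the runtime variables listed in the removed section.
ON_CPU=1 SERVING_JSON=tests/serving-tests-cpu.json \
  bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```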
@@ -76,25 +68,23 @@ Here is an example of one test inside `latency-tests.json`:
 ```
 
 In this example:
-
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
 
-### Throughput test
+#### Throughput test
 
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
 
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 
-### Serving test
+#### Serving test
 
-We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 
-```json
+```
 [
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
@@ -104,6 +94,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
         "tensor_parallel_size": 1,
         "swap_space": 16,
         "disable_log_stats": "",
+        "disable_log_requests": "",
         "load_format": "dummy"
     },
     "client_parameters": {
@@ -118,60 +109,45 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
 ```
 
 Inside this example:
 
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
-- The `client-parameters` includes the command line arguments for `vllm bench serve`.
-- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
+- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
+- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
 
 The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
-### Visualizing the results
-
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
+#### Visualizing the results
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
 
-The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
-When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
-If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-
-| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
-|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
-| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
-| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
-
-A comparison diagram will be generated below the table.
-Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
-<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
 
 ## Nightly test details
 
 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
 
-### Workflow
-
-- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
-- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
-- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+#### Workflow
+
+- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
+- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
+- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
+- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 
-### Nightly tests
+#### Nightly tests
 
 In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
 
-### Docker containers
+#### Docker containers
 
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
 
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
 
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
 
@@ -10,18 +10,12 @@ steps:
           - image: badouralix/curl-jq
             command:
             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
 
   - label: "A100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: A100
     depends_on: wait-for-container-image
-    if: build.branch == "main"
     plugins:
     - kubernetes:
         podSpec:
@@ -56,7 +50,6 @@ steps:
     agents:
       queue: H200
     depends_on: wait-for-container-image
-    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -82,7 +75,6 @@ steps:
     agents:
       queue: H100
     depends_on: wait-for-container-image
-    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -98,87 +90,3 @@ steps:
         environment:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-        - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-  #  key: block-h100
-  #  depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-        - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
@@ -1,4 +1,3 @@
-# Nightly benchmark annotation
 
 ## Description
 
@@ -10,19 +9,20 @@ This file contains the downloading link for benchmarking results.
 Please download the visualization scripts in the post
 
 
 ## Results reproduction
 
 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
   - Download `nightly-benchmarks.zip`.
-  - In the same folder, run the following code:
-
-  ```bash
+  - In the same folder, run the following code
+  ```
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git
 unzip nightly-benchmarks.zip
 VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
 ```
 
 And the results will be inside `./benchmarks/results`.
 
@@ -2,7 +2,6 @@
 # Nightly benchmark
-
 
 This benchmark aims to:
 
 - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
 - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
 
@ -10,30 +9,31 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)
|
|||||||
|
|
||||||
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||||
|
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
- Docker images:
|
- Docker images:
|
||||||
- vLLM: `vllm/vllm-openai:v0.6.2`
|
- vLLM: `vllm/vllm-openai:v0.6.2`
|
||||||
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
||||||
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
||||||
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
||||||
- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
||||||
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
||||||
- Hardware
|
- Hardware
|
||||||
- 8x Nvidia A100 GPUs
|
- 8x Nvidia A100 GPUs
|
||||||
- Workload:
|
- Workload:
|
||||||
- Dataset
|
- Dataset
|
||||||
- ShareGPT dataset
|
- ShareGPT dataset
|
||||||
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
||||||
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
||||||
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
||||||
- Models: llama-3 8B, llama-3 70B.
|
- Models: llama-3 8B, llama-3 70B.
|
||||||
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
||||||
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
||||||
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||||
|
|
||||||
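To make the arrival pattern concrete, the sketch below shows one way to generate Poisson arrivals at a given average QPS with a fixed seed: inter-arrival gaps drawn from an exponential distribution with mean `1/qps` give a Poisson process. This is an illustration of the idea only, not the exact sampling code used by the benchmark scripts.

```python
# Sketch: Poisson request arrivals at a given average QPS with a fixed seed.
# Illustrative only; the benchmark scripts have their own sampling logic.
import numpy as np


def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    # Exponential inter-arrival gaps with mean 1/qps give a Poisson process.
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)


print(poisson_arrival_times(num_requests=5, qps=4.0, seed=42))
```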
## Known issues

- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support the `ignore-eos` flag.
@ -1,48 +1,45 @@

# Performance benchmarks descriptions

## Latency tests

- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

{latency_tests_markdown_table}

## Throughput tests

- Input length: randomly sample 200 prompts from the ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.

{throughput_tests_markdown_table}

## Serving tests

- Input length: randomly sample 200 prompts from the ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
  - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99); see the sketch below for how these two latency metrics are defined.
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
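As a reference for the metric definitions, the sketch below computes TTFT and ITL from the per-token arrival timestamps of a single streamed response. It illustrates how the two quantities relate; it is not the measurement code used by the benchmark client, and the timestamps are made-up values.

```python
# Sketch: TTFT and ITL from per-token arrival timestamps of one request.
# Illustrative of the metric definitions only.


def ttft_and_itl(request_start: float, token_times: list[float]) -> tuple[float, list[float]]:
    # Time to first token: first token timestamp minus request start time.
    ttft = token_times[0] - request_start
    # Inter-token latency: gaps between consecutive token timestamps.
    itl = [t1 - t0 for t0, t1 in zip(token_times, token_times[1:])]
    return ttft, itl


ttft, itl = ttft_and_itl(0.0, [0.12, 0.15, 0.19, 0.22])
print(f"TTFT = {ttft * 1000:.1f} ms, mean ITL = {1000 * sum(itl) / len(itl):.1f} ms")
```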
{serving_tests_markdown_table}

## Platform Information

{platform_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:

```python
@ -57,9 +54,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```

The json string for all benchmarking tables:

```json
{benchmarking_results_in_json_string}
```

You can also check the raw experiment data in the Artifact tab of the Buildkite page.
|||||||
@ -1,307 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from importlib import util
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
|
||||||
|
|
||||||
|
|
||||||
def compare_data_columns(
|
|
||||||
files, name_column, data_column, info_cols, drop_column, debug=False
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Align concatenation by keys derived from info_cols instead of row order.
|
|
||||||
- Pick one canonical key list: subset of info_cols present in ALL files.
|
|
||||||
- For each file: set index to those keys, aggregate duplicates
|
|
||||||
- (mean for metric, first for names).
|
|
||||||
- Concat along axis=1 (indexes align), then reset_index so callers can
|
|
||||||
- group by columns.
|
|
||||||
- If --debug, add a <file_label>_name column per file.
|
|
||||||
"""
|
|
||||||
print("\ncompare_data_column:", data_column)
|
|
||||||
|
|
||||||
frames = []
|
|
||||||
raw_data_cols = []
|
|
||||||
compare_frames = []
|
|
||||||
|
|
||||||
# 1) choose a canonical key list from info_cols that exists in ALL files
|
|
||||||
cols_per_file = []
|
|
||||||
for f in files:
|
|
||||||
try:
|
|
||||||
df_tmp = pd.read_json(f, orient="records")
|
|
||||||
except Exception as err:
|
|
||||||
raise ValueError(f"Failed to read {f}") from err
|
|
||||||
cols_per_file.append(set(df_tmp.columns))
|
|
||||||
|
|
||||||
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
|
|
||||||
if not key_cols:
|
|
||||||
# soft fallback: use any info_cols present in the first file
|
|
||||||
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
|
|
||||||
if not key_cols:
|
|
||||||
raise ValueError(
|
|
||||||
"No common key columns found from info_cols across the input files."
|
|
||||||
)
|
|
||||||
|
|
||||||
# 2) build a single "meta" block (keys as columns) once, aligned by the key index
|
|
||||||
meta_added = False
|
|
||||||
|
|
||||||
for file in files:
|
|
||||||
df = pd.read_json(file, orient="records")
|
|
||||||
|
|
||||||
# Keep rows that actually have the compared metric (same as original behavior)
|
|
||||||
if drop_column in df.columns:
|
|
||||||
df = df.dropna(subset=[drop_column], ignore_index=True)
|
|
||||||
|
|
||||||
# Stabilize numeric key columns (harmless if missing)
|
|
||||||
for c in (
|
|
||||||
"Input Len",
|
|
||||||
"Output Len",
|
|
||||||
"TP Size",
|
|
||||||
"PP Size",
|
|
||||||
"# of max concurrency.",
|
|
||||||
"qps",
|
|
||||||
):
|
|
||||||
if c in df.columns:
|
|
||||||
df[c] = pd.to_numeric(df[c], errors="coerce")
|
|
||||||
|
|
||||||
# Ensure all key columns exist
|
|
||||||
for c in key_cols:
|
|
||||||
if c not in df.columns:
|
|
||||||
df[c] = pd.NA
|
|
||||||
|
|
||||||
# Set index = key_cols and aggregate duplicates → unique MultiIndex
|
|
||||||
df_idx = df.set_index(key_cols, drop=False)
|
|
||||||
|
|
||||||
# meta (key columns), unique per key
|
|
||||||
meta = df_idx[key_cols]
|
|
||||||
if not meta.index.is_unique:
|
|
||||||
meta = meta.groupby(level=key_cols, dropna=False).first()
|
|
||||||
|
|
||||||
# metric series for this file, aggregated to one row per key
|
|
||||||
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
|
|
||||||
s = df_idx[data_column]
|
|
||||||
if not s.index.is_unique:
|
|
||||||
s = s.groupby(level=key_cols, dropna=False).mean()
|
|
||||||
s.name = file_label # column label like original
|
|
||||||
|
|
||||||
# add meta once (from first file) so keys are the leftmost columns
|
|
||||||
if not meta_added:
|
|
||||||
frames.append(meta)
|
|
||||||
meta_added = True
|
|
||||||
|
|
||||||
# (NEW) debug: aligned test-name column per file
|
|
||||||
if debug and name_column in df_idx.columns:
|
|
||||||
name_s = df_idx[name_column]
|
|
||||||
if not name_s.index.is_unique:
|
|
||||||
name_s = name_s.groupby(level=key_cols, dropna=False).first()
|
|
||||||
name_s.name = f"{file_label}_name"
|
|
||||||
frames.append(name_s)
|
|
||||||
|
|
||||||
frames.append(s)
|
|
||||||
raw_data_cols.append(file_label)
|
|
||||||
compare_frames.append(s)
|
|
||||||
|
|
||||||
# Generalize ratio: for any file N>=2, add ratio (fileN / file1)
|
|
||||||
if len(compare_frames) >= 2:
|
|
||||||
base = compare_frames[0]
|
|
||||||
current = compare_frames[-1]
|
|
||||||
ratio = current / base
|
|
||||||
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
|
||||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
|
||||||
frames.append(ratio)
|
|
||||||
|
|
||||||
# 4) concat on columns with aligned MultiIndex;
|
|
||||||
# then reset_index to return keys as columns
|
|
||||||
concat_df = pd.concat(frames, axis=1)
|
|
||||||
concat_df = concat_df.reset_index(drop=True).reset_index()
|
|
||||||
if "index" in concat_df.columns:
|
|
||||||
concat_df = concat_df.drop(columns=["index"])
|
|
||||||
|
|
||||||
# Ensure key/info columns appear first (in your info_cols order)
|
|
||||||
front = [c for c in info_cols if c in concat_df.columns]
|
|
||||||
rest = [c for c in concat_df.columns if c not in front]
|
|
||||||
concat_df = concat_df[front + rest]
|
|
||||||
|
|
||||||
print(raw_data_cols)
|
|
||||||
return concat_df, raw_data_cols
|
|
||||||
|
|
||||||
|
|
||||||
def split_json_by_tp_pp(
|
|
||||||
input_file: str = "benchmark_results.json", output_root: str = "."
|
|
||||||
) -> list[str]:
|
|
||||||
"""
|
|
||||||
Split a benchmark JSON into separate folders by (TP Size, PP Size).
|
|
||||||
|
|
||||||
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
|
|
||||||
Returns: list of file paths written.
|
|
||||||
"""
|
|
||||||
# Load JSON data into DataFrame
|
|
||||||
with open(input_file, encoding="utf-8") as f:
|
|
||||||
data = json.load(f)
|
|
||||||
|
|
||||||
# If the JSON is a dict with a list under common keys, use that list
|
|
||||||
if isinstance(data, dict):
|
|
||||||
for key in ("results", "serving_results", "benchmarks", "data"):
|
|
||||||
if isinstance(data.get(key), list):
|
|
||||||
data = data[key]
|
|
||||||
break
|
|
||||||
|
|
||||||
df = pd.DataFrame(data)
|
|
||||||
|
|
||||||
# Keep only "serving" tests
|
|
||||||
name_col = next(
|
|
||||||
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
|
|
||||||
)
|
|
||||||
if name_col:
|
|
||||||
df = df[
|
|
||||||
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
|
|
||||||
].copy()
|
|
||||||
|
|
||||||
# Handle alias column names
|
|
||||||
rename_map = {
|
|
||||||
"tp_size": "TP Size",
|
|
||||||
"tensor_parallel_size": "TP Size",
|
|
||||||
"pp_size": "PP Size",
|
|
||||||
"pipeline_parallel_size": "PP Size",
|
|
||||||
}
|
|
||||||
df.rename(
|
|
||||||
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Ensure TP/PP columns exist (default to 1 if missing)
|
|
||||||
if "TP Size" not in df.columns:
|
|
||||||
df["TP Size"] = 1
|
|
||||||
if "PP Size" not in df.columns:
|
|
||||||
df["PP Size"] = 1
|
|
||||||
|
|
||||||
# make sure TP/PP are numeric ints with no NaN
|
|
||||||
df["TP Size"] = (
|
|
||||||
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
|
|
||||||
)
|
|
||||||
df["PP Size"] = (
|
|
||||||
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Split into separate folders
|
|
||||||
saved_paths: list[str] = []
|
|
||||||
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
|
|
||||||
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
|
|
||||||
os.makedirs(folder_name, exist_ok=True)
|
|
||||||
filepath = os.path.join(folder_name, "benchmark_results.json")
|
|
||||||
group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
|
|
||||||
print(f"Saved: {filepath}")
|
|
||||||
saved_paths.append(filepath)
|
|
||||||
|
|
||||||
return saved_paths
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"-f", "--file", action="append", type=str, help="input file name"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--debug", action="store_true", help="show all information for debugging"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--plot",
|
|
||||||
action=argparse.BooleanOptionalAction,
|
|
||||||
default=True,
|
|
||||||
help="plot perf diagrams or not --no-plot --plot",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"-x",
|
|
||||||
"--xaxis",
|
|
||||||
type=str,
|
|
||||||
default="# of max concurrency.",
|
|
||||||
help="column name to use as X Axis in comparision graph",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
drop_column = "P99"
|
|
||||||
name_column = "Test name"
|
|
||||||
info_cols = [
|
|
||||||
"Model",
|
|
||||||
"Dataset Name",
|
|
||||||
"Input Len",
|
|
||||||
"Output Len",
|
|
||||||
"TP Size",
|
|
||||||
"PP Size",
|
|
||||||
"# of max concurrency.",
|
|
||||||
"qps",
|
|
||||||
]
|
|
||||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
|
||||||
html_msgs_for_data_cols = [
|
|
||||||
"Compare Output Tokens /n",
|
|
||||||
"Median TTFT /n",
|
|
||||||
"Median TPOT /n",
|
|
||||||
]
|
|
||||||
|
|
||||||
if len(args.file) == 1:
|
|
||||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
|
||||||
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
|
|
||||||
else:
|
|
||||||
files = args.file
|
|
||||||
print("comparing : " + ", ".join(files))
|
|
||||||
debug = args.debug
|
|
||||||
plot = args.plot
|
|
||||||
# For Plot feature, assign y axis from one of info_cols
|
|
||||||
y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
|
|
||||||
with open("perf_comparison.html", "w") as text_file:
|
|
||||||
for i in range(len(data_cols_to_compare)):
|
|
||||||
output_df, raw_data_cols = compare_data_columns(
|
|
||||||
files,
|
|
||||||
name_column,
|
|
||||||
data_cols_to_compare[i],
|
|
||||||
info_cols,
|
|
||||||
drop_column,
|
|
||||||
debug=debug,
|
|
||||||
)
|
|
||||||
|
|
||||||
# For Plot feature, insert y axis from one of info_cols
|
|
||||||
raw_data_cols.insert(0, info_cols[y_axis_index])
|
|
||||||
|
|
||||||
filtered_info_cols = info_cols[:-2]
|
|
||||||
existing_group_cols = [
|
|
||||||
c for c in filtered_info_cols if c in output_df.columns
|
|
||||||
]
|
|
||||||
if not existing_group_cols:
|
|
||||||
raise ValueError(
|
|
||||||
f"No valid group-by columns "
|
|
||||||
f"Expected subset: {filtered_info_cols}, "
|
|
||||||
f"but DataFrame has: {list(output_df.columns)}"
|
|
||||||
)
|
|
||||||
output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
|
||||||
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
|
||||||
for name, group in output_groups:
|
|
||||||
html = group.to_html()
|
|
||||||
text_file.write(html_msgs_for_data_cols[i])
|
|
||||||
text_file.write(html)
|
|
||||||
|
|
||||||
if plot and plotly_found:
|
|
||||||
import plotly.express as px
|
|
||||||
|
|
||||||
df = group[raw_data_cols]
|
|
||||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
|
||||||
# Melt DataFrame for plotting
|
|
||||||
df_melted = df_sorted.melt(
|
|
||||||
id_vars=info_cols[y_axis_index],
|
|
||||||
var_name="Configuration",
|
|
||||||
value_name=data_cols_to_compare[i],
|
|
||||||
)
|
|
||||||
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
|
||||||
# Create Plotly line chart
|
|
||||||
fig = px.line(
|
|
||||||
df_melted,
|
|
||||||
x=info_cols[y_axis_index],
|
|
||||||
y=data_cols_to_compare[i],
|
|
||||||
color="Configuration",
|
|
||||||
title=title,
|
|
||||||
markers=True,
|
|
||||||
)
|
|
||||||
# Export to HTML
|
|
||||||
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
|
|
||||||
@ -1,19 +1,12 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import shlex
|
|
||||||
from importlib import util
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import psutil
|
|
||||||
import regex as re
|
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
results_folder = Path("results/")
|
||||||
|
|
||||||
# latency results and the keys that will be printed into markdown
|
# latency results and the keys that will be printed into markdown
|
||||||
latency_results = []
|
latency_results = []
|
||||||
latency_column_mapping = {
|
latency_column_mapping = {
|
||||||
@ -33,39 +26,28 @@ throughput_results = []
|
|||||||
throughput_results_column_mapping = {
|
throughput_results_column_mapping = {
|
||||||
"test_name": "Test name",
|
"test_name": "Test name",
|
||||||
"gpu_type": "GPU",
|
"gpu_type": "GPU",
|
||||||
"num_requests": "# of req.",
|
# "num_requests": "# of req.",
|
||||||
"total_num_tokens": "Total # of tokens",
|
# "total_num_tokens": "Total # of tokens",
|
||||||
"elapsed_time": "Elapsed time (s)",
|
# "elapsed_time": "Elapsed time (s)",
|
||||||
"requests_per_second": "Tput (req/s)",
|
"requests_per_second": "Tput (req/s)",
|
||||||
"tokens_per_second": "Tput (tok/s)",
|
# "tokens_per_second": "Tput (tok/s)",
|
||||||
}
|
}
|
||||||
|
|
||||||
# serving results and the keys that will be printed into markdown
|
# serving results and the keys that will be printed into markdown
|
||||||
serving_results = []
|
serving_results = []
|
||||||
serving_column_mapping = {
|
serving_column_mapping = {
|
||||||
"test_name": "Test name",
|
"test_name": "Test name",
|
||||||
"model_id": "Model",
|
|
||||||
"dataset_name": "Dataset Name",
|
|
||||||
"input_len": "Input Len",
|
|
||||||
"output_len": "Output Len",
|
|
||||||
"tp_size": "TP Size",
|
|
||||||
"pp_size": "PP Size",
|
|
||||||
"dtype": "dtype",
|
|
||||||
"gpu_type": "GPU",
|
"gpu_type": "GPU",
|
||||||
"completed": "# of req.",
|
# "completed": "# of req.",
|
||||||
"qps": "qps",
|
|
||||||
"max_concurrency": "# of max concurrency.",
|
|
||||||
"request_throughput": "Tput (req/s)",
|
"request_throughput": "Tput (req/s)",
|
||||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
# "input_throughput": "Input Tput (tok/s)",
|
||||||
"output_throughput": "Output Tput (tok/s)",
|
# "output_throughput": "Output Tput (tok/s)",
|
||||||
# "total_input_tokens": "Total input tokens",
|
|
||||||
# "total_output_tokens": "Total output tokens",
|
|
||||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||||
"median_ttft_ms": "Median TTFT (ms)",
|
"median_ttft_ms": "Median TTFT (ms)",
|
||||||
"p99_ttft_ms": "P99 TTFT (ms)",
|
"p99_ttft_ms": "P99 TTFT (ms)",
|
||||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
# "mean_tpot_ms": "Mean TPOT (ms)",
|
||||||
"median_tpot_ms": "Median",
|
# "median_tpot_ms": "Median",
|
||||||
"p99_tpot_ms": "P99",
|
# "p99_tpot_ms": "P99",
|
||||||
"mean_itl_ms": "Mean ITL (ms)",
|
"mean_itl_ms": "Mean ITL (ms)",
|
||||||
"median_itl_ms": "Median ITL (ms)",
|
"median_itl_ms": "Median ITL (ms)",
|
||||||
"p99_itl_ms": "P99 ITL (ms)",
|
"p99_itl_ms": "P99 ITL (ms)",
|
||||||
@ -81,194 +63,42 @@ def read_markdown(file):
|
|||||||
|
|
||||||
|
|
||||||
def results_to_json(latency, throughput, serving):
|
def results_to_json(latency, throughput, serving):
|
||||||
return json.dumps(
|
return json.dumps({
|
||||||
{
|
'latency': latency.to_dict(),
|
||||||
"latency": latency.to_dict(),
|
'throughput': throughput.to_dict(),
|
||||||
"throughput": throughput.to_dict(),
|
'serving': serving.to_dict()
|
||||||
"serving": serving.to_dict(),
|
})
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_size_with_unit(bytes, suffix="B"):
|
|
||||||
"""
|
|
||||||
Scale bytes to its proper format
|
|
||||||
e.g:
|
|
||||||
1253656 => '1.20MB'
|
|
||||||
1253656678 => '1.17GB'
|
|
||||||
"""
|
|
||||||
factor = 1024
|
|
||||||
for unit in ["", "K", "M", "G", "T", "P"]:
|
|
||||||
if bytes < factor:
|
|
||||||
return f"{bytes:.2f}{unit}{suffix}"
|
|
||||||
bytes /= factor
|
|
||||||
|
|
||||||
|
|
||||||
def _coerce(val: str) -> Any:
|
|
||||||
"""Best-effort type coercion from string to Python types."""
|
|
||||||
low = val.lower()
|
|
||||||
if low == "null":
|
|
||||||
return None
|
|
||||||
if low == "true":
|
|
||||||
return True
|
|
||||||
if low == "false":
|
|
||||||
return False
|
|
||||||
# integers
|
|
||||||
if re.fullmatch(r"[+-]?\d+", val):
|
|
||||||
try:
|
|
||||||
return int(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
# floats (keep 'inf'/'-inf'/'nan' as strings)
|
|
||||||
if re.fullmatch(r"[+-]?\d*\.\d+", val):
|
|
||||||
try:
|
|
||||||
return float(val)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return val
|
|
||||||
|
|
||||||
|
|
||||||
def parse_client_command(cmd: str) -> dict[str, Any]:
|
|
||||||
"""Parse the client_command shell string into {executable, script, args}."""
|
|
||||||
toks = shlex.split(cmd)
|
|
||||||
if len(toks) < 2:
|
|
||||||
raise ValueError("client_command must include an executable and a script")
|
|
||||||
executable, script = toks[0], toks[1]
|
|
||||||
args: dict[str, Any] = {}
|
|
||||||
|
|
||||||
i = 2
|
|
||||||
while i < len(toks):
|
|
||||||
t = toks[i]
|
|
||||||
if t.startswith("--"):
|
|
||||||
# --key=value or --key (value) or boolean flag
|
|
||||||
if "=" in t:
|
|
||||||
key, val = t.split("=", 1)
|
|
||||||
if key == "--metadata":
|
|
||||||
md = {}
|
|
||||||
if val:
|
|
||||||
if "=" in val:
|
|
||||||
k, v = val.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[val] = True
|
|
||||||
args[key] = md
|
|
||||||
else:
|
|
||||||
args[key] = _coerce(val)
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
key = t
|
|
||||||
|
|
||||||
# Special: consume metadata k=v pairs until next --flag
|
|
||||||
if key == "--metadata":
|
|
||||||
i += 1
|
|
||||||
md = {}
|
|
||||||
while i < len(toks) and not toks[i].startswith("--"):
|
|
||||||
pair = toks[i]
|
|
||||||
if "=" in pair:
|
|
||||||
k, v = pair.split("=", 1)
|
|
||||||
md[k] = _coerce(v)
|
|
||||||
else:
|
|
||||||
md[pair] = True
|
|
||||||
i += 1
|
|
||||||
args[key] = md
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Standard: check if next token is a value (not a flag)
|
|
||||||
if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
|
|
||||||
args[key] = _coerce(toks[i + 1])
|
|
||||||
i += 2
|
|
||||||
else:
|
|
||||||
# lone flag -> True
|
|
||||||
args[key] = True
|
|
||||||
i += 1
|
|
||||||
else:
|
|
||||||
# unexpected positional; skip
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
return {"executable": executable, "script": script, "args": args}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"-r",
|
|
||||||
"--result",
|
|
||||||
type=str,
|
|
||||||
default="results",
|
|
||||||
help="Folder name for benchmark output results.",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
results_folder = Path(args.result)
|
|
||||||
if not results_folder.exists():
|
|
||||||
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
|
|
||||||
# collect results
|
# collect results
|
||||||
for test_file in results_folder.glob("*.json"):
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
|
||||||
with open(test_file) as f:
|
with open(test_file) as f:
|
||||||
raw_result = json.loads(f.read())
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
if "serving" in str(test_file):
|
if "serving" in str(test_file):
|
||||||
# this result is generated via `vllm bench serve` command
|
# this result is generated via `benchmark_serving.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
command = json.loads(f.read())
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
# Parse Server Command Arg
|
|
||||||
out: dict[str, Any] = {
|
|
||||||
"server_command": parse_client_command(command["server_command"])
|
|
||||||
}
|
|
||||||
parse_args = [
|
|
||||||
"--tensor-parallel-size",
|
|
||||||
"--pipeline-parallel-size",
|
|
||||||
"--dtype",
|
|
||||||
]
|
|
||||||
col_mapping = ["tp_size", "pp_size", "dtype"]
|
|
||||||
for index, arg in enumerate(parse_args):
|
|
||||||
if arg in out["server_command"]["args"]:
|
|
||||||
raw_result.update(
|
|
||||||
{col_mapping[index]: out["server_command"]["args"][arg]}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Parse Client Command Arg
|
|
||||||
out: dict[str, Any] = {
|
|
||||||
"client_command": parse_client_command(command["client_command"])
|
|
||||||
}
|
|
||||||
parse_args = [
|
|
||||||
"--dataset-name",
|
|
||||||
"--random-input-len",
|
|
||||||
"--random-output-len",
|
|
||||||
"--request-rate",
|
|
||||||
]
|
|
||||||
col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
|
|
||||||
|
|
||||||
for index, arg in enumerate(parse_args):
|
|
||||||
if arg in out["client_command"]["args"]:
|
|
||||||
raw_result.update(
|
|
||||||
{col_mapping[index]: out["client_command"]["args"][arg]}
|
|
||||||
)
|
|
||||||
# Add Server, Client command
|
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
# update the test name of this result
|
# update the test name of this result
|
||||||
raw_result.update({"test_name": test_file.stem})
|
raw_result.update({"test_name": test_file.stem})
|
||||||
|
|
||||||
# add the result to raw_result
|
# add the result to raw_result
|
||||||
serving_results.append(raw_result)
|
serving_results.append(raw_result)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif "latency" in f.name:
|
elif "latency" in f.name:
|
||||||
# this result is generated via `vllm bench latency` command
|
# this result is generated via `benchmark_latency.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
command = json.loads(f.read())
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
# update the test name of this result
|
# update the test name of this result
|
||||||
@ -278,8 +108,7 @@ if __name__ == "__main__":
|
|||||||
for perc in [10, 25, 50, 75, 90, 99]:
|
for perc in [10, 25, 50, 75, 90, 99]:
|
||||||
# Multiply 1000 to convert the time unit from s to ms
|
# Multiply 1000 to convert the time unit from s to ms
|
||||||
raw_result.update(
|
raw_result.update(
|
||||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
|
||||||
)
|
|
||||||
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
||||||
|
|
||||||
# add the result to raw_result
|
# add the result to raw_result
|
||||||
@ -287,16 +116,11 @@ if __name__ == "__main__":
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
elif "throughput" in f.name:
|
elif "throughput" in f.name:
|
||||||
# this result is generated via `vllm bench throughput` command
|
# this result is generated via `benchmark_throughput.py`
|
||||||
|
|
||||||
# attach the benchmarking command to raw_result
|
# attach the benchmarking command to raw_result
|
||||||
try:
|
with open(test_file.with_suffix(".commands")) as f:
|
||||||
with open(test_file.with_suffix(".commands")) as f:
|
command = json.loads(f.read())
|
||||||
command = json.loads(f.read())
|
|
||||||
except OSError as e:
|
|
||||||
print(e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
raw_result.update(command)
|
raw_result.update(command)
|
||||||
|
|
||||||
# update the test name of this result
|
# update the test name of this result
|
||||||
@ -312,51 +136,26 @@ if __name__ == "__main__":
|
|||||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||||
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
||||||
|
|
||||||
svmem = psutil.virtual_memory()
|
raw_results_json = results_to_json(latency_results, throughput_results,
|
||||||
platform_data = {
|
serving_results)
|
||||||
"Physical cores": [psutil.cpu_count(logical=False)],
|
|
||||||
"Total cores": [psutil.cpu_count(logical=True)],
|
|
||||||
"Total Memory": [get_size_with_unit(svmem.total)],
|
|
||||||
}
|
|
||||||
|
|
||||||
if util.find_spec("numa") is not None:
|
|
||||||
from numa import info
|
|
||||||
|
|
||||||
platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]
|
|
||||||
|
|
||||||
if util.find_spec("cpuinfo") is not None:
|
|
||||||
from cpuinfo import get_cpu_info
|
|
||||||
|
|
||||||
platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]
|
|
||||||
|
|
||||||
platform_results = pd.DataFrame.from_dict(
|
|
||||||
platform_data, orient="index", columns=["Platform Info"]
|
|
||||||
)
|
|
||||||
|
|
||||||
raw_results_json = results_to_json(
|
|
||||||
latency_results, throughput_results, serving_results
|
|
||||||
)
|
|
||||||
|
|
||||||
# remapping the key, for visualization purpose
|
# remapping the key, for visualization purpose
|
||||||
if not latency_results.empty:
|
if not latency_results.empty:
|
||||||
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
|
latency_results = latency_results[list(
|
||||||
columns=latency_column_mapping
|
latency_column_mapping.keys())].rename(
|
||||||
)
|
columns=latency_column_mapping)
|
||||||
if not serving_results.empty:
|
if not serving_results.empty:
|
||||||
valid_columns = [
|
serving_results = serving_results[list(
|
||||||
col for col in serving_column_mapping if col in serving_results.columns
|
serving_column_mapping.keys())].rename(
|
||||||
]
|
columns=serving_column_mapping)
|
||||||
serving_results = serving_results[valid_columns].rename(
|
|
||||||
columns=serving_column_mapping
|
|
||||||
)
|
|
||||||
if not throughput_results.empty:
|
if not throughput_results.empty:
|
||||||
throughput_results = throughput_results[
|
throughput_results = throughput_results[list(
|
||||||
list(throughput_results_column_mapping.keys())
|
throughput_results_column_mapping.keys())].rename(
|
||||||
].rename(columns=throughput_results_column_mapping)
|
columns=throughput_results_column_mapping)
|
||||||
|
|
||||||
processed_results_json = results_to_json(
|
processed_results_json = results_to_json(latency_results,
|
||||||
latency_results, throughput_results, serving_results
|
throughput_results,
|
||||||
)
|
serving_results)
|
||||||
|
|
||||||
for df in [latency_results, serving_results, throughput_results]:
|
for df in [latency_results, serving_results, throughput_results]:
|
||||||
if df.empty:
|
if df.empty:
|
||||||
@ -368,45 +167,38 @@ if __name__ == "__main__":
|
|||||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||||
# we want to turn it into "8xGPUTYPE"
|
# we want to turn it into "8xGPUTYPE"
|
||||||
df["GPU"] = df["GPU"].apply(
|
df["GPU"] = df["GPU"].apply(
|
||||||
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
|
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
|
||||||
)
|
|
||||||
|
|
||||||
# get markdown tables
|
# get markdown tables
|
||||||
latency_md_table = tabulate(
|
latency_md_table = tabulate(latency_results,
|
||||||
latency_results, headers="keys", tablefmt="pipe", showindex=False
|
headers='keys',
|
||||||
)
|
tablefmt='pipe',
|
||||||
serving_md_table = tabulate(
|
showindex=False)
|
||||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
serving_md_table = tabulate(serving_results,
|
||||||
)
|
headers='keys',
|
||||||
throughput_md_table = tabulate(
|
tablefmt='pipe',
|
||||||
throughput_results, headers="keys", tablefmt="pipe", showindex=False
|
showindex=False)
|
||||||
)
|
throughput_md_table = tabulate(throughput_results,
|
||||||
platform_md_table = tabulate(
|
headers='keys',
|
||||||
platform_results, headers="keys", tablefmt="pipe", showindex=True
|
tablefmt='pipe',
|
||||||
)
|
showindex=False)
|
||||||
|
|
||||||
# document the result
|
# document the result
|
||||||
md_file = "benchmark_results.md"
|
with open(results_folder / "benchmark_results.md", "w") as f:
|
||||||
json_file = "benchmark_results.json"
|
|
||||||
with open(results_folder / md_file, "w") as f:
|
results = read_markdown("../.buildkite/nightly-benchmarks/" +
|
||||||
results = read_markdown(
|
"performance-benchmarks-descriptions.md")
|
||||||
"../.buildkite/nightly-benchmarks/"
|
|
||||||
+ "performance-benchmarks-descriptions.md"
|
|
||||||
)
|
|
||||||
results = results.format(
|
results = results.format(
|
||||||
latency_tests_markdown_table=latency_md_table,
|
latency_tests_markdown_table=latency_md_table,
|
||||||
throughput_tests_markdown_table=throughput_md_table,
|
throughput_tests_markdown_table=throughput_md_table,
|
||||||
serving_tests_markdown_table=serving_md_table,
|
serving_tests_markdown_table=serving_md_table,
|
||||||
platform_markdown_table=platform_md_table,
|
benchmarking_results_in_json_string=processed_results_json)
|
||||||
benchmarking_results_in_json_string=processed_results_json,
|
|
||||||
)
|
|
||||||
f.write(results)
|
f.write(results)
|
||||||
|
|
||||||
# document benchmarking results in json
|
# document benchmarking results in json
|
||||||
with open(results_folder / json_file, "w") as f:
|
with open(results_folder / "benchmark_results.json", "w") as f:
|
||||||
results = (
|
|
||||||
latency_results.to_dict(orient="records")
|
results = latency_results.to_dict(
|
||||||
+ throughput_results.to_dict(orient="records")
|
orient='records') + throughput_results.to_dict(
|
||||||
+ serving_results.to_dict(orient="records")
|
orient='records') + serving_results.to_dict(orient='records')
|
||||||
)
|
|
||||||
f.write(json.dumps(results))
|
f.write(json.dumps(results))
|
||||||
|
|||||||
@ -1,6 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
@ -15,12 +12,15 @@ def main(model, cachedir):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Download and save Hugging Face tokenizer"
|
description="Download and save Hugging Face tokenizer")
|
||||||
)
|
parser.add_argument("--model",
|
||||||
parser.add_argument("--model", type=str, required=True, help="Name of the model")
|
type=str,
|
||||||
parser.add_argument(
|
required=True,
|
||||||
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
|
help="Name of the model")
|
||||||
)
|
parser.add_argument("--cachedir",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="Directory to save the tokenizer")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args.model, args.cachedir)
|
main(args.model, args.cachedir)
|
||||||
|
|||||||
@ -1,6 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -12,33 +9,33 @@ from tabulate import tabulate
|
|||||||
|
|
||||||
def parse_arguments():
|
def parse_arguments():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Parse command line arguments for summary-nightly-results script."
|
description=
|
||||||
)
|
'Parse command line arguments for summary-nightly-results script.')
|
||||||
parser.add_argument(
|
parser.add_argument('--results-folder',
|
||||||
"--results-folder",
|
type=str,
|
||||||
type=str,
|
required=True,
|
||||||
required=True,
|
help='The folder where the results are stored.')
|
||||||
help="The folder where the results are stored.",
|
parser.add_argument('--description',
|
||||||
)
|
type=str,
|
||||||
parser.add_argument(
|
required=True,
|
||||||
"--description", type=str, required=True, help="Description of the results."
|
help='Description of the results.')
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
def get_perf(df, method, model, metric):
|
def get_perf(df, method, model, metric):
|
||||||
|
|
||||||
means = []
|
means = []
|
||||||
|
|
||||||
for qps in [2, 4, 8, 16, "inf"]:
|
for qps in [2, 4, 8, 16, "inf"]:
|
||||||
target = df["Test name"].str.contains(model)
|
target = df['Test name'].str.contains(model)
|
||||||
target = target & df["Engine"].str.contains(method)
|
target = target & df['Engine'].str.contains(method)
|
||||||
target = target & df["Test name"].str.contains("qps_" + str(qps))
|
target = target & df['Test name'].str.contains("qps_" + str(qps))
|
||||||
filtered_df = df[target]
|
filtered_df = df[target]
|
||||||
|
|
||||||
if filtered_df.empty:
|
if filtered_df.empty:
|
||||||
means.append(0.0)
|
means.append(0.)
|
||||||
else:
|
else:
|
||||||
means.append(filtered_df[metric].values[0])
|
means.append(filtered_df[metric].values[0])
|
||||||
|
|
||||||
@ -46,6 +43,7 @@ def get_perf(df, method, model, metric):
|
|||||||
|
|
||||||
|
|
||||||
def get_perf_w_std(df, method, model, metric):
|
def get_perf_w_std(df, method, model, metric):
|
||||||
|
|
||||||
if metric in ["TTFT", "ITL"]:
|
if metric in ["TTFT", "ITL"]:
|
||||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||||
mean = mean.tolist()
|
mean = mean.tolist()
|
||||||
@ -60,8 +58,7 @@ def get_perf_w_std(df, method, model, metric):
|
|||||||
else:
|
else:
|
||||||
assert metric == "Tput"
|
assert metric == "Tput"
|
||||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||||
df, method, model, "Output Tput (tok/s)"
|
df, method, model, "Output Tput (tok/s)")
|
||||||
)
|
|
||||||
mean = mean.tolist()
|
mean = mean.tolist()
|
||||||
std = None
|
std = None
|
||||||
|
|
||||||
@ -81,17 +78,18 @@ def main(args):
|
|||||||
# generate markdown table
|
# generate markdown table
|
||||||
df = pd.DataFrame.from_dict(results)
|
df = pd.DataFrame.from_dict(results)
|
||||||
|
|
||||||
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
|
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||||
|
|
||||||
with open(args.description) as f:
|
with open(args.description) as f:
|
||||||
description = f.read()
|
description = f.read()
|
||||||
|
|
||||||
description = description.format(nightly_results_benchmarking_table=md_table)
|
description = description.format(
|
||||||
|
nightly_results_benchmarking_table=md_table)
|
||||||
|
|
||||||
with open("nightly_results.md", "w") as f:
|
with open("nightly_results.md", "w") as f:
|
||||||
f.write(description)
|
f.write(description)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == '__main__':
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@ -1,6 +1,3 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
from lmdeploy.serve.openai.api_client import APIClient
|
from lmdeploy.serve.openai.api_client import APIClient
|
||||||
|
|
||||||
api_client = APIClient("http://localhost:8000")
|
api_client = APIClient("http://localhost:8000")
|
||||||
|
|||||||
@ -43,7 +43,7 @@ main() {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# The figures should be generated by a separate process outside the CI/CD pipeline
|
# The figures should be generated by a separate process outside the CI/CD pipeline
|
||||||
|
|
||||||
# # generate figures
|
# # generate figures
|
||||||
# python3 -m pip install tabulate pandas matplotlib
|
# python3 -m pip install tabulate pandas matplotlib
|
||||||
|
|||||||
@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
|
|||||||
echo "Container: vllm"
|
echo "Container: vllm"
|
||||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
||||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||||
|
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
@ -95,14 +95,12 @@ json2args() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
kill_gpu_processes() {
|
kill_gpu_processes() {
|
||||||
pkill -f '[p]ython'
|
pkill -f python
|
||||||
pkill -f '[p]ython3'
|
pkill -f python3
|
||||||
pkill -f '[t]ritonserver'
|
pkill -f tritonserver
|
||||||
pkill -f '[p]t_main_thread'
|
pkill -f pt_main_thread
|
||||||
pkill -f '[t]ext-generation'
|
pkill -f text-generation
|
||||||
pkill -f '[l]mdeploy'
|
pkill -f lmdeploy
|
||||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
|
||||||
pkill -f '[V]LLM'
|
|
||||||
|
|
||||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
@ -127,7 +125,7 @@ ensure_installed() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_serving_tests() {
|
run_serving_tests() {
|
||||||
# run serving tests using `vllm bench serve` command
|
# run serving tests using `benchmark_serving.py`
|
||||||
# $1: a json file specifying serving test cases
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
local serving_test_file
|
local serving_test_file
|
||||||
@ -227,7 +225,7 @@ run_serving_tests() {
|
|||||||
|
|
||||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||||
|
|
||||||
client_command="vllm bench serve \
|
client_command="python3 benchmark_serving.py \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
@ -248,7 +246,7 @@ run_serving_tests() {
|
|||||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||||
|
|
||||||
client_command="vllm bench serve \
|
client_command="python3 benchmark_serving.py \
|
||||||
--backend $backend \
|
--backend $backend \
|
||||||
--tokenizer /tokenizer_cache \
|
--tokenizer /tokenizer_cache \
|
||||||
--model $model \
|
--model $model \
|
||||||
@ -267,13 +265,13 @@ run_serving_tests() {
|
|||||||
$client_args"
|
$client_args"
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
||||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||||
exit 1
|
exit 1
|
||||||
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
echo "Running test case $test_name with qps $qps"
|
echo "Running test case $test_name with qps $qps"
|
||||||
echo "Client command: $client_command"
|
echo "Client command: $client_command"
|
||||||
@ -303,104 +301,6 @@ run_serving_tests() {
|
|||||||
kill_gpu_processes
|
kill_gpu_processes
|
||||||
}
|
}
|
||||||
|
|
||||||
run_genai_perf_tests() {
|
|
||||||
# run genai-perf tests
|
|
||||||
|
|
||||||
# $1: a json file specifying genai-perf test cases
|
|
||||||
local genai_perf_test_file
|
|
||||||
genai_perf_test_file=$1
|
|
||||||
|
|
||||||
# Iterate over genai-perf tests
|
|
||||||
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
|
||||||
# get the test name, and append the GPU type back to it.
|
|
||||||
test_name=$(echo "$params" | jq -r '.test_name')
|
|
||||||
|
|
||||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
|
||||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
|
||||||
echo "Skip test case $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# prepend the current serving engine to the test name
|
|
||||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
|
||||||
|
|
||||||
# get common parameters
|
|
||||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
|
||||||
model=$(echo "$common_params" | jq -r '.model')
|
|
||||||
tp=$(echo "$common_params" | jq -r '.tp')
|
|
||||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
|
||||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
|
||||||
port=$(echo "$common_params" | jq -r '.port')
|
|
||||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
|
||||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
|
||||||
|
|
||||||
# get client and server arguments
|
|
||||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
|
||||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
|
||||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
|
||||||
echo "Running over qps list $qps_list"
|
|
||||||
|
|
||||||
# check if there is enough GPU to run the test
|
|
||||||
if [[ $gpu_count -lt $tp ]]; then
|
|
||||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $reuse_server == "true" ]]; then
|
|
||||||
echo "Reuse previous server for test case $test_name"
|
|
||||||
else
|
|
||||||
kill_gpu_processes
|
|
||||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
|
||||||
"$server_params" "$common_params"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if wait_for_server; then
|
|
||||||
echo ""
|
|
||||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
|
||||||
else
|
|
||||||
echo ""
|
|
||||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# iterate over different QPS
|
|
||||||
for qps in $qps_list; do
|
|
||||||
# remove the surrounding single quote from qps
|
|
||||||
if [[ "$qps" == *"inf"* ]]; then
|
|
||||||
echo "qps was $qps"
|
|
||||||
qps=$num_prompts
|
|
||||||
echo "now qps is $qps"
|
|
||||||
fi
|
|
||||||
|
|
||||||
new_test_name=$test_name"_qps_"$qps
|
|
||||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
|
||||||
|
|
||||||
if [[ "$backend" == *"vllm"* ]]; then
|
|
||||||
backend="vllm"
|
|
||||||
fi
|
|
||||||
#TODO: add output dir.
|
|
||||||
client_command="genai-perf profile \
|
|
||||||
-m $model \
|
|
||||||
--service-kind openai \
|
|
||||||
--backend "$backend" \
|
|
||||||
--endpoint-type chat \
|
|
||||||
--streaming \
|
|
||||||
--url localhost:$port \
|
|
||||||
--request-rate $qps \
|
|
||||||
--num-prompts $num_prompts \
|
|
||||||
"
|
|
||||||
|
|
||||||
echo "Client command: $client_command"
|
|
||||||
|
|
||||||
eval "$client_command"
|
|
||||||
|
|
||||||
#TODO: process/record outputs
|
|
||||||
done
|
|
||||||
done
|
|
||||||
|
|
||||||
kill_gpu_processes
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
prepare_dataset() {
|
prepare_dataset() {
|
||||||
|
|
||||||
@ -415,7 +315,7 @@ prepare_dataset() {
|
|||||||
do
|
do
|
||||||
cat sonnet.txt >> sonnet_4x.txt
|
cat sonnet.txt >> sonnet_4x.txt
|
||||||
done
|
done
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
main() {
|
||||||
@ -428,17 +328,12 @@ main() {
|
|||||||
|
|
||||||
pip install -U transformers
|
pip install -U transformers
|
||||||
|
|
||||||
pip install -r requirements/dev.txt
|
|
||||||
which genai-perf
|
|
||||||
|
|
||||||
# check storage
|
# check storage
|
||||||
df -h
|
df -h
|
||||||
|
|
||||||
ensure_installed wget
|
ensure_installed wget
|
||||||
ensure_installed curl
|
ensure_installed curl
|
||||||
ensure_installed jq
|
ensure_installed jq
|
||||||
# genai-perf dependency
|
|
||||||
ensure_installed libb64-0d
|
|
||||||
|
|
||||||
prepare_dataset
|
prepare_dataset
|
||||||
|
|
||||||
@ -450,10 +345,6 @@ main() {
|
|||||||
# run the test
|
# run the test
|
||||||
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
||||||
|
|
||||||
# run genai-perf tests
|
|
||||||
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
|
|
||||||
mv artifacts/ $RESULTS_FOLDER/
|
|
||||||
|
|
||||||
# upload benchmark results to buildkite
|
# upload benchmark results to buildkite
|
||||||
python3 -m pip install tabulate pandas
|
python3 -m pip install tabulate pandas
|
||||||
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
||||||
|
|||||||
@ -10,38 +10,15 @@ set -x
|
|||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
check_gpus() {
|
check_gpus() {
|
||||||
if command -v nvidia-smi; then
|
# check the number of GPUs and GPU type.
|
||||||
# check the number of GPUs and GPU type.
|
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
|
||||||
elif command -v amd-smi; then
|
|
||||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
if [[ $gpu_count -gt 0 ]]; then
|
||||||
echo "GPU found."
|
echo "GPU found."
|
||||||
else
|
else
|
||||||
echo "Need at least 1 GPU to run benchmarking."
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if command -v nvidia-smi; then
|
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
|
||||||
elif command -v amd-smi; then
|
|
||||||
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
|
||||||
fi
|
|
||||||
echo "GPU type is $gpu_type"
|
|
||||||
}
|
|
||||||
|
|
||||||
check_cpus() {
|
|
||||||
# check the number of CPUs and NUMA Node and GPU type.
|
|
||||||
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
|
|
||||||
if [[ $numa_count -gt 0 ]]; then
|
|
||||||
echo "NUMA found."
|
|
||||||
echo $numa_count
|
|
||||||
else
|
|
||||||
echo "Need at least 1 NUMA to run benchmarking."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
declare -g gpu_type="cpu"
|
|
||||||
echo "GPU type is $gpu_type"
|
echo "GPU type is $gpu_type"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -83,22 +60,6 @@ json2args() {
|
|||||||
echo "$args"
|
echo "$args"
|
||||||
}
|
}
|
||||||
|
|
||||||
json2envs() {
|
|
||||||
# transforms the JSON string to environment variables.
|
|
||||||
# example:
|
|
||||||
# input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
|
|
||||||
# output: VLLM_CPU_KVCACHE_SPACE=5
|
|
||||||
local json_string=$1
|
|
||||||
local args=$(
|
|
||||||
echo "$json_string" | jq -r '
|
|
||||||
to_entries |
|
|
||||||
map((.key ) + "=" + (.value | tostring)) |
|
|
||||||
join(" ")
|
|
||||||
'
|
|
||||||
)
|
|
||||||
echo "$args"
|
|
||||||
}
|
|
||||||
|
|
||||||
wait_for_server() {
|
wait_for_server() {
|
||||||
# wait for vllm server to start
|
# wait for vllm server to start
|
||||||
# return 1 if vllm server crashes
|
# return 1 if vllm server crashes
|
||||||
@ -126,19 +87,12 @@ kill_gpu_processes() {
|
|||||||
ps -aux
|
ps -aux
|
||||||
lsof -t -i:8000 | xargs -r kill -9
|
lsof -t -i:8000 | xargs -r kill -9
|
||||||
pgrep python3 | xargs -r kill -9
|
pgrep python3 | xargs -r kill -9
|
||||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
|
||||||
pgrep VLLM | xargs -r kill -9
|
|
||||||
|
|
||||||
# wait until GPU memory usage smaller than 1GB
|
# wait until GPU memory usage smaller than 1GB
|
||||||
if command -v nvidia-smi; then
|
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
sleep 1
|
||||||
sleep 1
|
done
|
||||||
done
|
|
||||||
elif command -v amd-smi; then
|
|
||||||
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
# remove vllm config file
|
# remove vllm config file
|
||||||
rm -rf ~/.config/vllm
|
rm -rf ~/.config/vllm
|
||||||
@ -165,7 +119,7 @@ upload_to_buildkite() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
run_latency_tests() {
|
run_latency_tests() {
|
||||||
# run latency tests using `vllm bench latency` command
|
# run latency tests using `benchmark_latency.py`
|
||||||
# $1: a json file specifying latency test cases
|
# $1: a json file specifying latency test cases
|
||||||
|
|
||||||
local latency_test_file
|
local latency_test_file
|
||||||
@ -189,26 +143,15 @@ run_latency_tests() {
    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    latency_command=" $latency_envs vllm bench latency \
    latency_command="python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"
@ -234,7 +177,7 @@ run_latency_tests() {
}

run_throughput_tests() {
  # run throughput tests using `vllm bench throughput`
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases

  local throughput_test_file
@ -258,26 +201,15 @@ run_throughput_tests() {
    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    throughput_command=" $throughput_envs vllm bench throughput \
    throughput_command="python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"
@ -302,7 +234,7 @@ run_throughput_tests() {
}

run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
@ -325,36 +257,18 @@ run_serving_tests() {

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"
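A small sketch (not part of the diff) of the max_concurrency defaulting rule introduced above, using a made-up client_parameters fragment:

  client_params='{"num_prompts": 200}'      # hypothetical client_parameters
  max_concurrency_list=null                 # what jq returns when .max_concurrency_list is absent
  if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
    num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
    max_concurrency_list="[$num_prompts]"   # falls back to a single concurrency equal to num_prompts: [200]
  fi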
    # check if there is enough resources to run the test
    # check if there is enough GPU to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # check if server model and client model is aligned
@ -365,33 +279,23 @@ run_serving_tests() {
      continue
    fi

    server_command="$server_envs python3 \
    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
      if wait_for_server; then
        echo ""
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    bash -c "$server_command" &
    server_pid=$!

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
      echo ""
      echo "vllm failed to start within the timeout period."
    fi
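As a usage note (not part of the diff; the address below is made up), the remote-server branch above is driven entirely by environment variables: when REMOTE_HOST is unset the script launches the server itself via bash -c "$server_command" &, otherwise it only builds client arguments pointing at the already-running server:

  export REMOTE_HOST=10.0.0.5   # hypothetical host already serving vLLM
  export REMOTE_PORT=8000       # optional; when unset only --host=$REMOTE_HOST is passed to the client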
    # iterate over different QPS
@ -403,39 +307,32 @@ run_serving_tests() {
        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
        # pass the tensor parallel size to the client so that it can be displayed
        # on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          --metadata "tensor_parallel_size=$tp" \
          $client_args $client_remote_args "

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

        bash -c "$client_command"

        # record the benchmarking commands
        jq_output=$(jq -n \
          --arg server "$server_command" \
          --arg client "$client_command" \
          --arg gpu "$gpu_type" \
          '{
            server_command: $server,
            client_command: $client,
            gpu_type: $gpu
          }')
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

      done
      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      bash -c "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
    done

    # clean up
@ -445,30 +342,18 @@ run_serving_tests() {
}

main() {
  local ARCH
  ARCH=''
  if [ "$ON_CPU" == "1" ];then
    check_cpus
    ARCH='-cpu'
  else
    check_gpus
  fi
  check_gpus
  check_hf_token

  # Set to v1 to run v1 benchmark
  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
    export VLLM_USE_V1=1
  fi

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by `vllm bench serve` command
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"
  export VLLM_LOG_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
@ -478,9 +363,9 @@ main() {
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

  # postprocess benchmarking results
  pip install tabulate pandas
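As a usage sketch (not part of the diff), the newer defaults above resolve the test-case files per platform and can still be overridden through environment variables; the override filename below is hypothetical:

  export ON_CPU=1                             # main() then calls check_cpus and sets ARCH='-cpu',
                                              # so the defaults become serving-tests-cpu.json, latency-tests-cpu.json, throughput-tests-cpu.json
  export SERVING_JSON=my-serving-tests.json   # hypothetical explicit override of the serving test file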
@ -1,6 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
import json
import os
@ -35,8 +32,10 @@ serving_column_mapping = {
}

if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())
@ -55,16 +54,17 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table_with_headers = tabulate(serving_results,
                                             headers='keys',
                                             tablefmt='pipe',
                                             showindex=False)
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@ -74,9 +74,10 @@ if __name__ == "__main__":
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")
        f.write('\n')

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
@ -1,10 +1,6 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"

TIMEOUT_SECONDS=10
@ -1,21 +0,0 @@
[
  {
    "test_name": "llama8B_tp1_genai_perf",
    "qps_list": [4,8,16,32],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "port": 8000,
      "num_prompts": 500,
      "reuse_server": false
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "genai_perf_input_parameters": {
    }
  }
]
@ -1,30 +0,0 @@
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
@ -29,4 +29,4 @@
      "num-iters": 15
    }
  }
]
@ -35,7 +35,9 @@
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
@ -88,7 +90,9 @@
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
@ -141,7 +145,9 @@
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
@ -191,7 +197,9 @@
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
@ -243,7 +251,9 @@
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
@ -295,7 +305,9 @@
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
@ -1,202 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -1,205 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_pp1_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2pp3_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_pp1_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2pp3_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"pipeline_parallel_size": 3,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -1,168 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp4_random_1024_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 1024,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 100
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_pp6_random_1024_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"pipeline_parallel_size": 6,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 1024,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 100
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -7,6 +7,7 @@
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
@ -25,6 +26,7 @@
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
@ -43,6 +45,7 @@
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
@ -57,14 +60,14 @@
    "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
    "qps_list": [2],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "disable_log_requests": "",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "speculative_config": {
        "model": "turboderp/Qwama-0.5B-Instruct",
        "num_speculative_tokens": 4,
        "draft_tensor_parallel_size": 1
      }
      "speculative_model": "turboderp/Qwama-0.5B-Instruct",
      "num_speculative_tokens": 4,
      "speculative_draft_tensor_parallel_size": 1,
      "use_v2_block_manager": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
@ -1,32 +0,0 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
@ -32,4 +32,4 @@
      "backend": "vllm"
    }
  }
]
@ -1,46 +0,0 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true
@ -1,46 +1,12 @@
|
|||||||
steps:
|
steps:
|
||||||
# aarch64 + CUDA builds
|
- label: "Build wheel - CUDA 12.1"
|
||||||
- label: "Build arm64 wheel - CUDA 12.8"
|
|
||||||
id: build-wheel-arm64-cuda-12-8
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
|
||||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
# x86 + CUDA builds
|
|
||||||
- label: "Build wheel - CUDA 12.8"
|
|
||||||
id: build-wheel-cuda-12-8
|
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/upload-wheels.sh"
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- block: "Build CUDA 12.6 wheel"
|
|
||||||
key: block-build-cu126-wheel
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 12.6"
|
|
||||||
depends_on: block-build-cu126-wheel
|
|
||||||
id: build-wheel-cuda-12-6
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
@ -51,93 +17,45 @@ steps:
|
|||||||
|
|
||||||
- label: "Build wheel - CUDA 11.8"
|
- label: "Build wheel - CUDA 11.8"
|
||||||
# depends_on: block-build-cu118-wheel
|
# depends_on: block-build-cu118-wheel
|
||||||
id: build-wheel-cuda-11-8
|
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/upload-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build release image (x86)"
|
- block: "Build release image"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
key: block-release-image-build
|
key: block-release-image-build
|
||||||
|
|
||||||
- label: "Build release image (x86)"
|
- label: "Build release image"
|
||||||
depends_on: block-release-image-build
|
depends_on: block-release-image-build
|
||||||
id: build-release-image-x86
|
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
|
||||||
# re-tag to default image tag and push, just in case arm64 build fails
|
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Build release image (arm64)"
|
|
||||||
depends_on: block-release-image-build
|
|
||||||
id: build-release-image-arm64
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
|
||||||
|
|
||||||
# Add job to create multi-arch manifest
|
|
||||||
- label: "Create multi-arch manifest"
|
|
||||||
depends_on:
|
|
||||||
- build-release-image-x86
|
|
||||||
- build-release-image-arm64
|
|
||||||
id: create-multi-arch-manifest
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
|
|
||||||
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
|
||||||
|
|
||||||
- label: "Annotate release workflow"
|
|
||||||
depends_on:
|
|
||||||
- create-multi-arch-manifest
|
|
||||||
- build-wheel-cuda-12-8
|
|
||||||
- build-wheel-cuda-12-6
|
|
||||||
- build-wheel-cuda-11-8
|
|
||||||
id: annotate-release-workflow
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "bash .buildkite/scripts/annotate-release.sh"
|
|
||||||
|
|
||||||
- label: "Build and publish TPU release image"
|
- label: "Build and publish TPU release image"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
if: build.env("NIGHTLY") == "1"
|
if: build.env("NIGHTLY") == "1"
|
||||||
agents:
|
agents:
|
||||||
queue: tpu_queue_postmerge
|
queue: tpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "yes | docker system prune -a"
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
|
||||||
- "git fetch --all"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
|
|
||||||
- "docker push vllm/vllm-tpu:nightly"
|
- "docker push vllm/vllm-tpu:nightly"
|
||||||
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
||||||
plugins:
|
plugins:
|
||||||
- docker-login#v3.0.0:
|
- docker-login#v3.0.0:
|
||||||
username: vllmbot
|
username: vllm
|
||||||
password-env: DOCKERHUB_TOKEN
|
password-env: DOCKERHUB_TOKEN
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- input: "Provide Release version here"
|
|
||||||
id: input-release-version
|
|
||||||
fields:
|
|
||||||
- text: "What is the release version?"
|
|
||||||
key: release-version
|
|
||||||
|
|
||||||
- block: "Build CPU release image"
|
- block: "Build CPU release image"
|
||||||
key: block-cpu-release-image-build
|
key: block-cpu-release-image-build
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
@ -148,24 +66,7 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- block: "Build Neuron release image"
|
|
||||||
key: block-neuron-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build and publish Neuron release image"
|
|
||||||
depends_on: block-neuron-release-image-build
|
|
||||||
agents:
|
|
||||||
queue: neuron-postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|||||||
156  .buildkite/run-amd-test.sh  Executable file
@ -0,0 +1,156 @@
#!/bin/bash

# This script runs test inside the corresponding ROCm docker container.
set -o pipefail

# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- ROCm info"
rocminfo

# cleanup older docker images
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}

# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state

while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT

echo "--- Running container"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

commands=$@
echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
    --ignore=kernels/test_attention.py \
    --ignore=kernels/test_attention_selector.py \
    --ignore=kernels/test_blocksparse_attention.py \
    --ignore=kernels/test_causal_conv1d.py \
    --ignore=kernels/test_cutlass.py \
    --ignore=kernels/test_encoder_decoder_attn.py \
    --ignore=kernels/test_flash_attn.py \
    --ignore=kernels/test_flashinfer.py \
    --ignore=kernels/test_int8_quant.py \
    --ignore=kernels/test_machete_gemm.py \
    --ignore=kernels/test_mamba_ssm.py \
    --ignore=kernels/test_marlin_gemm.py \
    --ignore=kernels/test_moe.py \
    --ignore=kernels/test_prefix_prefill.py \
    --ignore=kernels/test_rand.py \
    --ignore=kernels/test_sampler.py"
fi

#ignore certain Entrypoints tests
if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
    --ignore=entrypoints/openai/test_accuracy.py \
    --ignore=entrypoints/openai/test_audio.py \
    --ignore=entrypoints/openai/test_encoder_decoder.py \
    --ignore=entrypoints/openai/test_embedding.py \
    --ignore=entrypoints/openai/test_oot_registration.py "}
fi

PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    docker run \
      --device /dev/kfd --device /dev/dri \
      --network host \
      --shm-size=16gb \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
else
  docker run \
    --device /dev/kfd --device /dev/dri \
    --network host \
    --shm-size=16gb \
    --rm \
    -e HIP_VISIBLE_DEVICES=0 \
    -e HF_TOKEN \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi
@ -5,16 +5,16 @@
set -ex
set -o pipefail

# cd 2 levels into the working directory
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
cd "$(dirname "${BASH_SOURCE[0]}")/.."

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# run python-based benchmarks and upload the result to buildkite
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?

vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r

# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
vllm bench serve \
python3 benchmarks/benchmark_serving.py \
  --backend vllm \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.s390x .
docker build -t cpu-test -f Dockerfile.ppc64le .
85  .buildkite/run-cpu-test.sh  Normal file
@ -0,0 +1,85 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

function cpu_tests() {
  set -e
  export NUMA_NODE=$2

  # offline inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
    set -e
    python3 examples/offline_inference.py"

  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pip install pytest pytest-asyncio \
      decord einops librosa peft Pillow sentence-transformers soundfile \
      transformers_stream_generator matplotlib datamodel_code_generator
    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
    pytest -v -s tests/models/decoder_only/language -m cpu_model
    pytest -v -s tests/models/embedding/language -m cpu_model
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"

  # Run compressed-tensor test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

  # Run AWQ test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_ipex_quant.py"

  # Run chunked-prefill and prefix-cache test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
      tests/basic_correctness/test_chunked_prefill.py"

  # online inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    export VLLM_CPU_KVCACHE_SPACE=10
    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
      --backend vllm \
      --dataset-name random \
      --model facebook/opt-125m \
      --num-prompts 20 \
      --endpoint /v1/completions \
      --tokenizer facebook/opt-125m"
}

# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@ -9,14 +9,13 @@ python3 use_existing_torch.py

# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX" \
  --build-arg torch_cuda_arch_list="9.0+PTX"
  --build-arg vllm_fa_cmake_gpu_arches="90-real"

# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }
@ -24,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
python3 examples/offline_inference.py
'
16  .buildkite/run-hpu-test.sh  Normal file
@ -0,0 +1,16 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t hpu-test-env -f Dockerfile.hpu .

# Setup cleanup
remove_docker_container() { docker rm -f hpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
@ -3,7 +3,7 @@
set -euox pipefail

if [[ $# -lt 4 ]]; then
  echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
  echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
  exit 1
fi
@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
@ -26,17 +25,17 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
  last_build=$(cat /tmp/neuron-docker-build-timestamp)
  current_time=$(date +%s)
  if [ $((current_time - last_build)) -gt 86400 ]; then
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker system prune -f
    docker volume prune -f && docker system prune -f
    rm -rf "${HF_MOUNT:?}/*"
    rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
    echo "$current_time" > /tmp/neuron-docker-build-timestamp
  fi
else
  date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t "${image_name}" -f docker/Dockerfile.neuron .
docker build -t "${image_name}" -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() {
@ -45,20 +44,11 @@ remove_docker_container() {
trap remove_docker_container EXIT

# Run the image
docker run --rm -it --device=/dev/neuron0 --network bridge \
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
  -v "${HF_CACHE}:${HF_MOUNT}" \
  -e "HF_HOME=${HF_MOUNT}" \
  -e "HF_TOKEN=${HF_TOKEN}" \
  -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
  -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
  --name "${container_name}" \
  ${image_name} \
  /bin/bash -c "
  /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
    set -e; # Exit on first error
    python3 /workspace/vllm/examples/offline_inference/neuron.py;
    python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
    for f in /workspace/vllm/tests/neuron/2_core/*.py; do
      echo \"Running test file: \$f\";
      python3 -m pytest \$f -v --capture=tee-sys;
    done
  "
16  .buildkite/run-openvino-test.sh  Executable file
@ -0,0 +1,16 @@
#!/bin/bash

# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .

# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
17  .buildkite/run-tpu-test.sh  Normal file
@ -0,0 +1,17 @@
#!/bin/bash

set -e

# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
19  .buildkite/run-xpu-test.sh  Normal file
@ -0,0 +1,19 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .

# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
  python3 examples/offline_inference.py
  python3 examples/offline_inference_cli.py -tp 2
'
@ -1,31 +0,0 @@
#!/bin/bash

set -ex

# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')

if [ -z "$RELEASE_VERSION" ]; then
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
  exit 1
fi

buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`

To download and upload the image:

\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
@ -1,17 +0,0 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.

# Check if argument is given
if [ $# -lt 1 ]; then
  echo "Usage: $0 ci.log"
  exit 1
fi

INPUT_FILE="$1"

# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"

# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
@ -1,242 +0,0 @@
#!/bin/bash

# This script runs test inside the corresponding ROCm docker container.
set -o pipefail

# Export Python path
export PYTHONPATH=".."

# Print ROCm version
echo "--- Confirming Clean Initial State"
while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- ROCm info"
rocminfo

# cleanup older docker images
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}

# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state

while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

remove_docker_container() {
  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT

echo "--- Running container"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

commands=$@
echo "Commands:$commands"

if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi

if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi

if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi

if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi

if [[ $commands == *"pytest -v -s lora"* ]]; then
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi

#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \
    --ignore=kernels/core/test_fused_quant_layernorm.py \
    --ignore=kernels/core/test_permute_cols.py"
fi

if [[ $commands == *" kernels/attention"* ]]; then
  commands="${commands} \
    --ignore=kernels/attention/test_attention_selector.py \
    --ignore=kernels/attention/test_encoder_decoder_attn.py \
    --ignore=kernels/attention/test_flash_attn.py \
    --ignore=kernels/attention/test_flashinfer.py \
    --ignore=kernels/attention/test_prefix_prefill.py \
    --ignore=kernels/attention/test_cascade_flash_attn.py \
    --ignore=kernels/attention/test_mha_attn.py \
    --ignore=kernels/attention/test_lightning_attn.py \
    --ignore=kernels/attention/test_attention.py"
fi

if [[ $commands == *" kernels/quantization"* ]]; then
  commands="${commands} \
    --ignore=kernels/quantization/test_int8_quant.py \
    --ignore=kernels/quantization/test_machete_mm.py \
    --ignore=kernels/quantization/test_block_fp8.py \
    --ignore=kernels/quantization/test_block_int8.py \
    --ignore=kernels/quantization/test_marlin_gemm.py \
    --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
    --ignore=kernels/quantization/test_int8_kernel.py"
fi

if [[ $commands == *" kernels/mamba"* ]]; then
  commands="${commands} \
    --ignore=kernels/mamba/test_mamba_mixer2.py \
    --ignore=kernels/mamba/test_causal_conv1d.py \
    --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
fi

if [[ $commands == *" kernels/moe"* ]]; then
  commands="${commands} \
    --ignore=kernels/moe/test_moe.py \
    --ignore=kernels/moe/test_cutlass_moe.py \
    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
fi

#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
    --ignore=entrypoints/openai/test_audio.py \
    --ignore=entrypoints/openai/test_shutdown.py \
    --ignore=entrypoints/openai/test_completion.py \
    --ignore=entrypoints/openai/test_sleep.py \
    --ignore=entrypoints/openai/test_models.py \
    --ignore=entrypoints/openai/test_lora_adapters.py \
    --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
    --ignore=entrypoints/openai/test_root_path.py \
    --ignore=entrypoints/openai/test_tokenization.py \
    --ignore=entrypoints/openai/test_prompt_validation.py "}
fi

#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
    --ignore=entrypoints/llm/test_chat.py \
    --ignore=entrypoints/llm/test_accuracy.py \
    --ignore=entrypoints/llm/test_init.py \
    --ignore=entrypoints/llm/test_generate_multiple_loras.py \
    --ignore=entrypoints/llm/test_prompt_validation.py "}
fi

#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi

# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13


PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."

# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
  # assign job count as the number of shards used
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
    # assign shard-id for each shard
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
    echo "Shard ${GPU} commands:$commands_gpu"
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
    docker run \
      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
      --network=host \
      --shm-size=16gb \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -e AWS_ACCESS_KEY_ID \
      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      -e "PYTHONPATH=${MYPYTHONPATH}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${commands_gpu}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  #wait for all processes to finish and collect exit codes
  for pid in "${PIDS[@]}"; do
    wait "${pid}"
    STATUS+=($?)
  done
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
else
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
    --network=host \
    --shm-size=16gb \
    --rm \
    -e HIP_VISIBLE_DEVICES=0 \
    -e HF_TOKEN \
    -e AWS_ACCESS_KEY_ID \
    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "PYTHONPATH=${MYPYTHONPATH}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi
@ -1,49 +0,0 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Setup cleanup
remove_docker_container() {
  if [[ -n "$container_id" ]]; then
    podman stop --all -t0
    podman rm -f "$container_id" || true
  fi
  podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)

function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    set -e
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
}

# All of CPU tests are expected to be finished less than 40 mins.

export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
@ -1,107 +0,0 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
# used for TP/PP E2E test
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}

export CMAKE_BUILD_PARALLEL_LEVEL=32

# Setup cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

function cpu_tests() {
  set -e
  export NUMA_NODE=$2

  # list packages
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
    set -e
    pip list"

  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pip list"

  # offline inference
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -v -s tests/kernels/test_onednn.py"

  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    # Note: disable until supports V1
    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

    # Note: disable Bart until supports V1
    pytest -v -s tests/models/language/generation -m cpu_model \
      --ignore=tests/models/language/generation/test_bart.py
    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
      --ignore=tests/models/language/generation/test_bart.py

    pytest -v -s tests/models/language/pooling -m cpu_model
    pytest -v -s tests/models/multimodal/generation \
      --ignore=tests/models/multimodal/generation/test_mllama.py \
      --ignore=tests/models/multimodal/generation/test_pixtral.py \
      -m cpu_model"

  # Run compressed-tensor test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

  # Note: disable it until supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
  #   VLLM_USE_V1=0 pytest -s -v \
  #     tests/quantization/test_ipex_quant.py"

  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
      tests/lora/test_qwen2vl.py"

  # online serving
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions'
}

# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@ -1,56 +0,0 @@
#!/bin/bash

# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -exuo pipefail

# Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
FROM gaudi-base-image:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

WORKDIR /workspace/

RUN git clone https://github.com/vllm-project/vllm-gaudi.git

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

EOF

# Setup cleanup
# certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use
# separate remove_docker_containers and remove_docker_containers_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers

echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
  hpu-plugin-v1-test-env \
  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"

EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then
  echo "Test with basic model passed"
else
  echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
fi

# The trap will handle the container removal and final exit.
@ -1,167 +0,0 @@
#!/bin/bash

set -xu


remove_docker_container() {
  docker rm -f tpu-test || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
cleanup_docker

# For HF_TOKEN.
source /etc/environment

docker run --privileged --net host --shm-size=16G -it \
  -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
  vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.

echo "--- Starting script inside Docker container ---"

# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"

# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
  && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
  && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
  && python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"

echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0

# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
  local test_num=$1
  local test_name=$2
  local test_command=$3
  local log_file="$RESULTS_DIR/test_${test_num}.log"
  local actual_exit_code

  echo "--- TEST_$test_num: Running $test_name ---"

  # Execute the test command.
  eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
  actual_exit_code=$?

  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

  if [ "$actual_exit_code" -ne 0 ]; then
    echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
    echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
    if [ -f "$log_file" ]; then
      cat "$log_file" >&2
    else
      echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
    fi
    echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
    return "$actual_exit_code" # Return the failure code
  else
    echo "TEST_$test_num ($test_name) PASSED."
    return 0 # Return success
  fi
}

# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
  local test_num_arg="$1"
  local test_name_arg="$2"
  local test_command_arg="$3"

  # Run the test
  run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
  local test_specific_exit_code=$?

  # If the test failed, set the overall script exit code to 1
  if [ "$test_specific_exit_code" -ne 0 ]; then
    # No need for extra echo here, run_test already logged the failure.
    overall_script_exit_code=1
  fi
}

# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
  "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 4 "test_tpu_qkv_linear.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
run_and_track_test 7 "test_tpu_int8.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"

# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
  echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
  echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.

# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?

# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
  echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
  exit "$DOCKER_RUN_EXIT_CODE"
else
  echo "Docker run command completed successfully."
  exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@ -1,175 +0,0 @@
#!/bin/bash

set -xu

remove_docker_container() {
  docker rm -f tpu-test || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .

# Set up cleanup.
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
cleanup_docker

# For HF_TOKEN.
source /etc/environment

docker run --privileged --net host --shm-size=16G -it \
  -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
  vllm-tpu /bin/bash -c '
set -e # Exit immediately if a command exits with a non-zero status.
set -u # Treat unset variables as an error.

echo "--- Starting script inside Docker container ---"

# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"

# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
  && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
  && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
  && python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"

echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0

# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
  local test_num=$1
  local test_name=$2
  local test_command=$3
  local log_file="$RESULTS_DIR/test_${test_num}.log"
  local actual_exit_code

  echo "--- TEST_$test_num: Running $test_name ---"

  # Execute the test command.
  eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
  actual_exit_code=$?

  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

  if [ "$actual_exit_code" -ne 0 ]; then
    echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
    echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
    if [ -f "$log_file" ]; then
      cat "$log_file" >&2
    else
      echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
    fi
    echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
    return "$actual_exit_code" # Return the failure code
  else
    echo "TEST_$test_num ($test_name) PASSED."
    return 0 # Return success
  fi
}

# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
  local test_num_arg="$1"
  local test_name_arg="$2"
  local test_command_arg="$3"

  # Run the test
  run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
  local test_specific_exit_code=$?

  # If the test failed, set the overall script exit code to 1
  if [ "$test_specific_exit_code" -ne 0 ]; then
    # No need for extra echo here, run_test already logged the failure.
    overall_script_exit_code=1
  fi
}

# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
  "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
  "python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"

# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
  echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
  echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.

# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?

# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
  echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
  exit "$DOCKER_RUN_EXIT_CODE"
else
  echo "Docker run command completed successfully."
  exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@ -1,48 +0,0 @@
#!/bin/bash

# This script builds the XPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu .

# Setup cleanup
remove_docker_container() {
  docker rm -f "${container_name}" || true;
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
}
trap remove_docker_container EXIT

# Run the image and test offline inference/tensor parallel
docker run \
  --device /dev/dri \
  -v /dev/dri/by-path:/dev/dri/by-path \
  --entrypoint="" \
  -e "HF_TOKEN=${HF_TOKEN}" \
  -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
  --name "${container_name}" \
  "${image_name}" \
  bash -c '
  set -e
  echo $ZE_AFFINITY_MASK
  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
  VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
  cd tests
  pytest -v -s v1/core
  pytest -v -s v1/engine
  pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
  pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
  pytest -v -s v1/structured_output
  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
  pytest -v -s v1/test_serial_utils.py
  pytest -v -s v1/test_utils.py
  pytest -v -s v1/test_metrics_reader.py
'
@ -1,18 +0,0 @@
#!/bin/bash

# Usage: ./rerun_test.sh path/to/test.py::test_name

# Check if argument is given
if [ $# -lt 1 ]; then
  echo "Usage: $0 path/to/test.py::test_name"
  echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
  exit 1
fi

TEST=$1
COUNT=1

while pytest -sv "$TEST"; do
  COUNT=$((COUNT + 1))
  echo "RUN NUMBER ${COUNT}"
done
@ -1,24 +0,0 @@
#!/bin/bash

set -euo pipefail

docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
  echo "Failed to determine Docker root directory."
  exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=24h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi
@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=tpu-test

# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128
@ -1,90 +0,0 @@
#!/bin/bash

if [ ! -f "$1" ]; then
  echo "Error: The env file '$1' does not exist."
  exit 1 # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE

remove_docker_container() {
  docker rm -f $CONTAINER_NAME || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"

if [ -z "$HF_TOKEN" ]; then
  echo "Error: HF_TOKEN is not set or is empty."
  exit 1
fi

# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If there is no mounted drive, just create the folder."
  exit 1
fi

echo "Run model $MODEL"
echo

echo "starting docker...$CONTAINER_NAME"
echo
docker run \
  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
  --env-file $ENV_FILE \
  -e HF_TOKEN="$HF_TOKEN" \
  -e TARGET_COMMIT=$BUILDKITE_COMMIT \
  -e MODEL=$MODEL \
  -e WORKSPACE=/workspace \
  --name $CONTAINER_NAME \
  -d \
  --privileged \
  --network host \
  -v /dev/shm:/dev/shm \
  vllm/vllm-tpu-bm tail -f /dev/null

echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"

echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"

throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"

if [ "$BUILDKITE" = "true" ]; then
  echo "Running inside Buildkite"
  buildkite-agent artifact upload "$VLLM_LOG"
  buildkite-agent artifact upload "$BM_LOG"
else
  echo "Not running inside Buildkite"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Failed to get the throughput"
  exit 1
fi

if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
  exit 1
fi
@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=tpu-test

# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
MAX_NUM_SEQS=128
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800
OUTPUT_LEN=128
@ -1,93 +0,0 @@
#!/bin/bash

set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

if [ -n "$TARGET_COMMIT" ]; then
  head_hash=$(git rev-parse HEAD)
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
    exit 1
  fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
mkdir "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in the background
#
echo "launching vllm..."
echo "logging to $VLLM_LOG"
echo

VLLM_USE_V1=1 vllm serve $MODEL \
  --seed 42 \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
  --no-enable-prefix-caching \
  --download_dir $DOWNLOAD_DIR \
  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &

echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for up to 20 minutes (120 checks, 10 seconds apart)...
for i in {1..120}; do
  # TODO: detect other types of errors.
  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
    echo "Detected RuntimeError, exiting."
    exit 1
  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
    echo "Application started"
    break
  else
    echo "wait for 10 seconds..."
    sleep 10
  fi
done

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
vllm bench serve \
  --backend vllm \
  --model $MODEL \
  --dataset-name sonnet \
  --dataset-path benchmarks/sonnet_4x.txt \
  --sonnet-input-len $INPUT_LEN \
  --sonnet-output-len $OUTPUT_LEN \
  --ignore-eos > "$BM_LOG"

echo "completed..."
echo

throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo
@ -2,13 +2,12 @@
# adding a new command to an existing step. See different options here for examples.

# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
@ -16,7 +15,7 @@
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
@ -25,46 +24,44 @@
# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.

steps:
##### fast check tests #####

- label: Pytorch Nightly Dependency Override Check # 2min
|
- label: Documentation Build # 2min
|
||||||
# if this test fails, it means the nightly torch version is not compatible with some
|
working_dir: "/vllm-workspace/test_docs/docs"
|
||||||
# of the dependencies. Please check the error message and add the package to whitelist
|
fast_check: true
|
||||||
# in /vllm/tools/generate_nightly_torch_test.py
|
no_gpu: True
|
||||||
soft_fail: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- requirements/nightly_torch_test.txt
|
|
||||||
commands:
|
commands:
|
||||||
- bash standalone_tests/pytorch_nightly_dependency.sh
|
- pip install -r requirements-docs.txt
|
||||||
|
- SPHINXOPTS=\"-W\" make html
|
||||||
|
# Check API reference (if it fails, you may have missing mock imports)
|
||||||
|
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
mirror_hardwares: [amdexperimental]
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/mq_llm_engine
|
- tests/mq_llm_engine
|
||||||
- tests/async_engine
|
- tests/async_engine
|
||||||
- tests/test_inputs.py
|
- tests/test_inputs
|
||||||
- tests/test_outputs.py
|
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
- tests/utils_
|
- tests/test_utils
|
||||||
- tests/worker
|
- tests/worker
|
||||||
- tests/standalone_tests/lazy_imports.py
|
- tests/standalone_tests/lazy_torch_compile.py
|
||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_imports.py
|
- python3 standalone_tests/lazy_torch_compile.py
|
||||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||||
- pytest -v -s async_engine # AsyncLLMEngine
|
- pytest -v -s async_engine # AsyncLLMEngine
|
||||||
|
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
|
||||||
- pytest -v -s multimodal
|
- pytest -v -s multimodal
|
||||||
- pytest -v -s utils_ # Utils
|
- pytest -v -s test_utils.py # Utils
|
||||||
- pytest -v -s worker # Worker
|
- pytest -v -s worker # Worker
|
||||||
|
|
||||||
- label: Python-only Installation Test
|
- label: Python-only Installation Test
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/standalone_tests/python_only_compile.sh
|
- tests/standalone_tests/python_only_compile.sh
|
||||||
- setup.py
|
- setup.py
|
||||||
@ -72,24 +69,28 @@ steps:
|
|||||||
- bash standalone_tests/python_only_compile.sh
|
- bash standalone_tests/python_only_compile.sh
|
||||||
|
|
||||||
- label: Basic Correctness Test # 30min
|
- label: Basic Correctness Test # 30min
|
||||||
mirror_hardwares: [amdexperimental]
|
#mirror_hardwares: [amd]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_basic_correctness
|
- tests/basic_correctness/test_basic_correctness
|
||||||
- tests/basic_correctness/test_cpu_offload
|
- tests/basic_correctness/test_cpu_offload
|
||||||
- tests/basic_correctness/test_preemption
|
- tests/basic_correctness/test_preemption
|
||||||
- tests/basic_correctness/test_cumem.py
|
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s basic_correctness/test_cumem.py
|
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
|
|
||||||
|
- label: Chunked Prefill Test
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/
|
||||||
|
- tests/basic_correctness/test_chunked_prefill
|
||||||
|
commands:
|
||||||
|
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||||
|
|
||||||
- label: Core Test # 10min
|
- label: Core Test # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/core
|
- vllm/core
|
||||||
@ -98,119 +99,59 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s core
|
- pytest -v -s core
|
||||||
|
|
||||||
- label: Entrypoints Test (LLM) # 40min
|
- label: Entrypoints Test # 40min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/entrypoints/llm
|
|
||||||
- tests/entrypoints/offline_mode
|
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
|
||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
|
|
||||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||||
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
|
||||||
- label: Entrypoints Test (API Server) # 40min
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
fast_check: true
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/entrypoints/openai
|
|
||||||
- tests/entrypoints/test_chat_utils
|
|
||||||
commands:
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
|
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
|
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 10min
|
- label: Distributed Tests (4 GPUs) # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/core/
|
- vllm/core/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed
|
||||||
- tests/distributed/test_pynccl
|
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||||
- tests/distributed/test_events
|
- tests/compile
|
||||||
- tests/compile/test_basic_correctness
|
|
||||||
- examples/offline_inference/rlhf.py
|
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
|
||||||
- tests/v1/test_async_llm_dp.py
|
|
||||||
- tests/v1/test_external_lb_dp.py
|
|
||||||
- tests/v1/test_internal_lb_dp.py
|
|
||||||
- tests/v1/test_hybrid_lb_dp.py
|
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
|
||||||
commands:
|
commands:
|
||||||
# test with tp=2 and external_dp=2
|
|
||||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
# test with tp=2 and pp=2
|
|
||||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
# test with internal dp
|
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
|
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
|
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
|
||||||
# when we have multiple distributed example tests
|
|
||||||
- pushd ../examples/offline_inference
|
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
|
||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
|
||||||
- popd
|
|
||||||
|
|
||||||
- label: EPLB Algorithm Test
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/distributed/eplb
|
|
||||||
- tests/distributed/test_eplb_algo.py
|
|
||||||
commands:
|
|
||||||
- pytest -v -s distributed/test_eplb_algo.py
|
|
||||||
|
|
||||||
- label: EPLB Execution Test # 5min
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
num_gpus: 4
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/distributed/eplb
|
|
||||||
- tests/distributed/test_eplb_execute.py
|
|
||||||
commands:
|
|
||||||
- pytest -v -s distributed/test_eplb_execute.py
|
|
||||||
|
|
||||||
- label: Metrics, Tracing Test # 10min
|
- label: Metrics, Tracing Test # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
num_gpus: 2
|
||||||
num_gpus: 2
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/metrics
|
- tests/metrics
|
||||||
- tests/tracing
|
- tests/tracing
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s metrics
|
- pytest -v -s metrics
|
||||||
- "pip install \
|
- "pip install \
|
||||||
'opentelemetry-sdk>=1.26.0' \
|
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||||
'opentelemetry-api>=1.26.0' \
|
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
||||||
- pytest -v -s tracing
|
- pytest -v -s tracing
|
||||||
|
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
|
|
||||||
- label: Regression Test # 5min
|
- label: Regression Test # 5min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/test_regression
|
- tests/test_regression
|
||||||
@ -220,120 +161,92 @@ steps:
|
|||||||
working_dir: "/vllm-workspace/tests" # optional
|
working_dir: "/vllm-workspace/tests" # optional
|
||||||
|
|
||||||
- label: Engine Test # 10min
|
- label: Engine Test # 10min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/engine
|
- tests/engine
|
||||||
- tests/tokenization
|
- tests/tokenization
|
||||||
- tests/test_sequence
|
|
||||||
- tests/test_config
|
|
||||||
- tests/test_logger
|
|
||||||
- tests/test_vllm_port
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||||
# OOM in the CI unless we run this separately
|
# OOM in the CI unless we run this separately
|
||||||
- pytest -v -s tokenization
|
- pytest -v -s tokenization
|
||||||
|
|
||||||
- label: V1 Test
|
- label: V1 Test
|
||||||
mirror_hardwares: [amdexperimental]
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
- VLLM_USE_V1=1 pytest -v -s v1
|
||||||
- pytest -v -s v1/core
|
|
||||||
- pytest -v -s v1/engine
|
|
||||||
- pytest -v -s v1/entrypoints
|
|
||||||
- pytest -v -s v1/executor
|
|
||||||
- pytest -v -s v1/sample
|
|
||||||
- pytest -v -s v1/logits_processors
|
|
||||||
- pytest -v -s v1/worker
|
|
||||||
- pytest -v -s v1/structured_output
|
|
||||||
- pytest -v -s v1/spec_decode
|
|
||||||
- pytest -v -s v1/kv_connector/unit
|
|
||||||
- pytest -v -s v1/metrics
|
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
|
||||||
- pytest -v -s v1/test_utils.py
|
|
||||||
- pytest -v -s v1/test_oracle.py
|
|
||||||
- pytest -v -s v1/test_metrics_reader.py
|
|
||||||
# TODO: accuracy does not match, whether setting
|
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
|
||||||
- pytest -v -s v1/e2e
|
|
||||||
# Integration test for streaming correctness (requires special branch).
|
|
||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
|
||||||
|
|
||||||
- label: Examples Test # 25min
|
- label: Examples Test # 25min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
|
#mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/entrypoints
|
- vllm/entrypoints
|
||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 offline_inference.py
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 cpu_offload.py
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 offline_inference_chat.py
|
||||||
- python3 offline_inference/prefix_caching.py
|
- python3 offline_inference_with_prefix.py
|
||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 llm_engine_example.py
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference_vision_language.py
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference_vision_language_multi_image.py
|
||||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference_encoder_decoder.py
|
||||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- python3 offline_inference_classification.py
|
||||||
- python3 offline_inference/encoder_decoder.py
|
- python3 offline_inference_embedding.py
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference_scoring.py
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||||
- python3 offline_inference/basic/embed.py
|
|
||||||
- python3 offline_inference/basic/score.py
|
|
||||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
|
||||||
|
|
||||||
- label: Platform Tests (CUDA)
|
- label: Prefix Caching Test # 9min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/cuda
|
- tests/prefix_caching
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s cuda/test_cuda_context.py
|
- pytest -v -s prefix_caching
|
||||||
|
|
||||||
- label: Samplers Test # 36min
|
- label: Samplers Test # 36min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/layers
|
- vllm/model_executor/layers
|
||||||
- vllm/sampling_metadata.py
|
- vllm/sampling_metadata.py
|
||||||
- tests/samplers
|
- tests/samplers
|
||||||
- tests/conftest.py
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s samplers
|
- pytest -v -s samplers
|
||||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||||
|
|
||||||
|
- label: LogitsProcessor Test # 5min
|
||||||
|
mirror_hardwares: [amd]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor/layers
|
||||||
|
- vllm/model_executor/guided_decoding
|
||||||
|
- tests/test_logits_processor
|
||||||
|
- tests/model_executor/test_guided_processors
|
||||||
|
commands:
|
||||||
|
- pytest -v -s test_logits_processor.py
|
||||||
|
- pytest -v -s model_executor/test_guided_processors.py
|
||||||
|
|
||||||
|
- label: Speculative decoding tests # 30min
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/spec_decode
|
||||||
|
- tests/spec_decode
|
||||||
|
commands:
|
||||||
|
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
|
||||||
|
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
|
||||||
|
|
||||||
- label: LoRA Test %N # 15min each
|
- label: LoRA Test %N # 15min each
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/lora
|
- vllm/lora
|
||||||
- tests/lora
|
- tests/lora
|
||||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: PyTorch Compilation Unit Tests
|
- label: "PyTorch Fullgraph Smoke Test" # 9min
|
||||||
mirror_hardwares: [amdexperimental]
|
fast_check: true
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/compile
|
|
||||||
commands:
|
|
||||||
- pytest -v -s compile/test_pass_manager.py
|
|
||||||
- pytest -v -s compile/test_fusion.py
|
|
||||||
- pytest -v -s compile/test_fusion_attn.py
|
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
|
||||||
- pytest -v -s compile/test_sequence_parallelism.py
|
|
||||||
- pytest -v -s compile/test_async_tp.py
|
|
||||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
|
||||||
- pytest -v -s compile/test_decorator.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@ -342,136 +255,60 @@ steps:
|
|||||||
# these tests need to be separated, cannot combine
|
# these tests need to be separated, cannot combine
|
||||||
- pytest -v -s compile/piecewise/test_simple.py
|
- pytest -v -s compile/piecewise/test_simple.py
|
||||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
|
||||||
- pytest -v -s compile/piecewise/test_multiple_graphs.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 18min
|
- label: "PyTorch Fullgraph Test" # 18min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py
|
- pytest -v -s compile/test_full_graph.py
|
||||||
|
|
||||||
- label: Kernels Core Operation Test
|
- label: Kernels Test %N # 1h each
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/core
|
|
||||||
|
|
||||||
- label: Kernels Attention Test %N
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/attention/
|
|
||||||
- vllm/attention
|
- vllm/attention
|
||||||
- vllm/v1/attention
|
- tests/kernels
|
||||||
- tests/kernels/attention
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 4
|
||||||
|
|
||||||
- label: Kernels Quantization Test %N
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
- tests/kernels/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
|
||||||
parallelism: 2
|
|
||||||
|
|
||||||
- label: Kernels MoE Test %N
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/cutlass_w8a8/moe/
|
|
||||||
- csrc/moe/
|
|
||||||
- tests/kernels/moe
|
|
||||||
- vllm/model_executor/layers/fused_moe/
|
|
||||||
- vllm/distributed/device_communicators/
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
|
||||||
parallelism: 2
|
|
||||||
|
|
||||||
- label: Kernels Mamba Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/mamba/
|
|
||||||
- tests/kernels/mamba
|
|
||||||
commands:
|
|
||||||
- pytest -v -s kernels/mamba
|
|
||||||
|
|
||||||
- label: Tensorizer Test # 11min
|
- label: Tensorizer Test # 11min
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amd]
|
||||||
|
soft_fail: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor/model_loader
|
- vllm/model_executor/model_loader
|
||||||
- tests/tensorizer_loader
|
- tests/tensorizer_loader
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s tensorizer_loader
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
|
||||||
|
|
||||||
- label: Model Executor Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/model_executor
|
|
||||||
- tests/model_executor
|
|
||||||
commands:
|
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s model_executor
|
|
||||||
|
|
||||||
- label: Benchmarks # 9min
|
- label: Benchmarks # 9min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/.buildkite"
|
working_dir: "/vllm-workspace/.buildkite"
|
||||||
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- benchmarks/
|
- benchmarks/
|
||||||
commands:
|
commands:
|
||||||
- bash scripts/run-benchmarks.sh
|
- bash run-benchmarks.sh
|
||||||
|
|
||||||
- label: Benchmarks CLI Test # 10min
|
- label: Quantization Test # 33min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/benchmarks/
|
|
||||||
commands:
|
|
||||||
- pytest -v -s benchmarks/
|
|
||||||
|
|
||||||
- label: Quantization Test
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
- tests/quantization
|
- tests/quantization
|
||||||
commands:
|
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
# temporary install here since we need nightly, will move to requirements/test.in
|
|
||||||
# after torchao 0.12 release
|
|
||||||
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
|
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
mirror_hardwares: [amdexperimental]
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- vllm/model_executor/layers/quantization
|
- vllm/model_executor/layers/quantization
|
||||||
commands:
|
commands:
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||||
- label: OpenAI API correctness
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/entrypoints/openai/
|
|
||||||
- vllm/model_executor/models/whisper.py
|
|
||||||
commands: # LMEval+Transcription WER check
|
|
||||||
- pytest -s entrypoints/openai/correctness/
|
|
||||||
|
|
||||||
- label: Encoder Decoder tests # 5min
|
- label: Encoder Decoder tests # 5min
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/encoder_decoder
|
- tests/encoder_decoder
|
||||||
@ -479,136 +316,95 @@ steps:
|
|||||||
- pytest -v -s encoder_decoder
|
- pytest -v -s encoder_decoder
|
||||||
|
|
||||||
- label: OpenAI-Compatible Tool Use # 20 min
|
- label: OpenAI-Compatible Tool Use # 20 min
-  mirror_hardwares: [amdexperimental]
   fast_check: false
+  mirror_hardwares: [ amd ]
   source_file_dependencies:
   - vllm/
   - tests/tool_use
-  - tests/mistral_tool_use
   commands:
   - pytest -v -s tool_use
-  - pytest -v -s mistral_tool_use

 ##### models test #####

 - label: Basic Models Test # 24min
-  mirror_hardwares: [amdexperimental]
-  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/models
   commands:
-  - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
-  - pytest -v -s models/test_utils.py
-  - pytest -v -s models/test_vision.py
   - pytest -v -s models/test_initialization.py

-- label: Language Models Test (Standard)
+- label: Language Models Test (Standard) # 32min
-  mirror_hardwares: [amdexperimental]
+  #mirror_hardwares: [amd]
-  torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/models/language
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
   commands:
-  - pip freeze | grep -E 'torch'
+  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-  - pytest -v -s models/language -m core_model
+  - pytest -v -s models/embedding/language -m core_model

-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Test (Extended) # 1h10min
-  mirror_hardwares: [amdexperimental]
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/language/generation
-  commands:
-  # Install fast path packages for testing against transformers
-  # Note: also needed to run plamo2 model in vLLM
-  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
-  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-  - pytest -v -s models/language/generation -m hybrid_model

-- label: Language Models Test (Extended Generation) # 1hr20min
-  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/language/generation
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
   commands:
-  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pytest -v -s models/embedding/language -m 'not core_model'
-  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

-- label: Language Models Test (Extended Pooling) # 36min
+- label: Multi-Modal Models Test (Standard) # 40min
-  mirror_hardwares: [amdexperimental]
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/audio_language
+  - tests/models/encoder_decoder/vision_language
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/embedding/vision_language -m core_model
+  - pytest -v -s models/encoder_decoder/audio_language -m core_model
+  - pytest -v -s models/encoder_decoder/language -m core_model
+  - pytest -v -s models/encoder_decoder/vision_language -m core_model

+- label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/language/pooling
+  - tests/models/decoder_only/audio_language
-  commands:
+  - tests/models/decoder_only/vision_language
-  - pytest -v -s models/language/pooling -m 'not core_model'
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
-- label: Multi-Modal Processor Test
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+  # HACK - run phi3v tests separately to sidestep this transformers bug
+  # https://github.com/huggingface/transformers/issues/34307
+  - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/embedding/vision_language -m 'not core_model'
+  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

-- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Extended) 2 # 38m
-  mirror_hardwares: [amdexperimental]
-  torch_nightly: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
-  commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pip freeze | grep -E 'torch'
-  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-  - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

-- label: Multi-Modal Models Test (Extended) 1
-  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/multimodal
+  - tests/models/decoder_only/vision_language
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

-- label: Multi-Modal Models Test (Extended) 2
-  mirror_hardwares: [amdexperimental]
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
-  commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

-- label: Multi-Modal Models Test (Extended) 3
-  mirror_hardwares: [amdexperimental]
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/models/multimodal
-  commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

-- label: Quantized Models Test
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/model_executor/layers/quantization
-  - tests/models/quantization
-  commands:
-  - pytest -v -s models/quantization

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  mirror_hardwares: [amdexperimental]
   optional: true
   commands:
   - echo 'Testing custom models...'
@@ -616,59 +412,10 @@ steps:
   # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
 # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

-- label: Transformers Nightly Models Test
-  working_dir: "/vllm-workspace/"
-  optional: true
-  commands:
-  - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
-  - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py
-  - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/audio_language.py --model-type whisper
-  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

-- label: Blackwell Test
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  # optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - csrc/attention/mla/
-  - csrc/quantization/cutlass_w8a8/moe/
-  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
-  commands:
-  - nvidia-smi
-  - python3 examples/offline_inference/basic/chat.py
-  # Attention
-  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
-  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
-  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
-  - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
-  # Quantization
-  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
-  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
-  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
-  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
-  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-  - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
-  - pytest -v -s tests/kernels/moe/test_flashinfer.py

 ##### 1 GPU test #####
 ##### multi gpus test #####

 - label: Distributed Comm Ops Test # 7min
-  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -679,7 +426,6 @@ steps:
   - pytest -v -s distributed/test_shm_broadcast.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
-  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   num_nodes: 2
@@ -689,21 +435,16 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
-  - tests/examples/offline_inference/data_parallel.py
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 40min
-  mirror_hardwares: [amdexperimental]
+  #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -716,55 +457,57 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
-  - entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
-  - vllm/v1/engine/
   commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
-  - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-  # test sequence parallel
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  # this test fails consistently.
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
-  # TODO: investigate and fix
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-  - pytest -v -s models/multimodal/generation/test_maverick.py

 - label: Plugin Tests (2 GPUs) # 40min
-  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
+  fast_check: true
   source_file_dependencies:
   - vllm/plugins/
   - tests/plugins/
   commands:
-  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  # begin platform plugin tests, all the code in-between runs on dummy platform
   - pip install -e ./plugins/vllm_add_dummy_platform
   - pytest -v -s plugins_tests/test_platform_plugins.py
   - pip uninstall vllm_add_dummy_platform -y
   # end platform plugin tests
   # other tests continue here:
-  - pytest -v -s plugins_tests/test_scheduler_plugins.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+- label: Multi-step Tests (4 GPUs) # 36min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/model_executor/layers/sampler.py
+  - vllm/sequence.py
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/multi_step_worker.py
+  - vllm/worker/model_runner_base.py
+  - vllm/worker/model_runner.py
+  - vllm/worker/multi_step_model_runner.py
+  - vllm/engine
+  - tests/multi_step
+  commands:
+  - pytest -v -s multi_step/test_correctness_async_llm.py
+  - pytest -v -s multi_step/test_correctness_llm.py

 - label: Pipeline Parallelism Test # 45min
-  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -778,7 +521,6 @@ steps:
   - pytest -v -s distributed/test_pipeline_parallel.py

 - label: LoRA TP Test (Distributed)
-  mirror_hardwares: [amdexperimental]
   num_gpus: 4
   source_file_dependencies:
   - vllm/lora
@@ -787,15 +529,16 @@ steps:
   # FIXIT: find out which code initialize cuda before running the test
   # before the fix, we need to use spawn to test it
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  # There is some Tensor Parallelism related processing logic in LoRA that
+  # This test runs llama 13B, so it is required to run on 4 GPUs.
+  - pytest -v -s -x lora/test_long_context.py
+  # There is some Tensor Parallelism related processing logic in LoRA that
   # requires multi-GPU testing for validation.
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
-  - pytest -v -s -x lora/test_multi_loras_with_tp.py
+  - pytest -v -s -x lora/test_minicpmv_tp.py

 - label: Weight Loading Multiple GPU Test # 33min
-  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -805,7 +548,6 @@ steps:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   gpu: a100
@@ -814,7 +556,7 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

 ##### multi gpus test #####
@@ -826,7 +568,7 @@ steps:
   num_gpus: 4
   source_file_dependencies:
   - vllm/
   commands:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
@@ -844,11 +586,4 @@ steps:
   - vllm/model_executor/layers/quantization
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4

-- label: Qwen MoE EP Test # optional
-  gpu: h200
-  optional: true
-  num_gpus: 2
-  commands:
-  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
@@ -14,19 +14,8 @@ fi
 # Get the single wheel file
 wheel="${wheel_files[0]}"

-# Detect architecture and rename 'linux' to appropriate manylinux version
+# Rename 'linux' to 'manylinux1' in the wheel filename
-arch=$(uname -m)
+new_wheel="${wheel/linux/manylinux1}"
-if [[ $arch == "x86_64" ]]; then
-    manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
-    manylinux_version="manylinux2014"
-else
-    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
-    manylinux_version="manylinux1"
-fi
-
-# Rename 'linux' to the appropriate manylinux version in the wheel filename
-new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
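For reference, the architecture-dependent rename in the hunk above is plain bash parameter substitution. The snippet below is a minimal standalone sketch of the same behaviour, not part of the diff; the wheel filename used here is hypothetical.

```bash
#!/usr/bin/env bash
# Illustrative sketch only; the wheel filename below is hypothetical.
wheel="vllm-0.1.0-cp38-abi3-linux_x86_64.whl"

# Pick the manylinux tag based on the build machine's architecture.
arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
    manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
    manylinux_version="manylinux2014"
else
    manylinux_version="manylinux1"
fi

# ${wheel/linux/$manylinux_version} replaces the first occurrence of "linux",
# e.g. ...-linux_x86_64.whl becomes ...-manylinux1_x86_64.whl on x86_64.
new_wheel="${wheel/linux/$manylinux_version}"
echo "$new_wheel"
```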
@@ -61,11 +50,8 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu12 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -77,13 +63,9 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu128 wheels (default wheels)
+    # only upload index.html for cu12 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
@@ -1,6 +0,0 @@
-# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
-have_fun: false # Just review the code
-code_review:
-  comment_severity_threshold: HIGH # Reduce quantity of comments
-  pull_request_opened:
-    summary: false # Don't summarize the PR in a separate comment
.github/CODEOWNERS (99 changes, vendored)
@@ -2,87 +2,32 @@
 # for more info about CODEOWNERS file

 # This lists cover the "core" components of vLLM that require careful review
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+CMakeLists.txt @tlrmchlsmth
-/vllm/model_executor/layers/mamba @tdoublep
-/vllm/multimodal @DarkLight1337 @ywang96
-/vllm/vllm_flash_attn @LucasWilkinson
-/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
-/vllm/compilation @zou3519 @youkaichao @ProExpertProg
-CMakeLists.txt @tlrmchlsmth @LucasWilkinson
-
-# Any change to the VllmConfig changes can have a large user-facing impact,
-# so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
+/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
-/vllm/v1/attention/backends/triton_attn.py @tdoublep

 # Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
+/tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/distributed/test_multi_node_assignment.py @youkaichao
+/tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
-/tests/distributed/test_pipeline_parallel.py @youkaichao
-/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/spec_decode @njhill @LiuXiaoxuanPKU
-/tests/test_inputs.py @DarkLight1337 @ywang96
+/tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/quantization @mgoin @robertgshaw2-neuralmagic
-/tests/v1/structured_output @mgoin @russellb @aarnphm
+/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/weight_loading @mgoin @youkaichao @yewentao256
+/tests/distributed/test_multi_node_assignment.py @youkaichao
-/tests/lora @jeejeelee
+/tests/distributed/test_pipeline_parallel.py @youkaichao
-/tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/distributed/test_same_node.py @youkaichao
+/tests/multi_step @alexm-neuralmagic @comaniac
-# Docs
+/tests/weight_loading @mgoin @youkaichao
-/docs @hmellor
+/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
-mkdocs.yaml @hmellor

-# CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
-/csrc/cpu @bigPYJ1151
-/vllm/platforms/cpu.py @bigPYJ1151
-/cmake/cpu_extension.cmake @bigPYJ1151
-/docker/Dockerfile.cpu @bigPYJ1151

-# Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
-/vllm/platforms/xpu.py @jikunshang
-/docker/Dockerfile.xpu @jikunshang

-# Qwen-specific files
-/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
-/vllm/model_executor/models/qwen* @sighingnow

-# Mistral-specific files
-/vllm/model_executor/models/mistral*.py @patrickvonplaten
-/vllm/model_executor/models/mixtral*.py @patrickvonplaten
-/vllm/model_executor/models/voxtral*.py @patrickvonplaten
-/vllm/model_executor/models/pixtral*.py @patrickvonplaten
-/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
-/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

-# Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep

-# ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
-/vllm/v1/attention/backends/rocm*.py @gshtras
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
-/vllm/attention/ops/rocm*.py @gshtras
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
.github/ISSUE_TEMPLATE/200-installation.yml (2 changes, vendored)
@@ -14,7 +14,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
.github/ISSUE_TEMPLATE/300-usage.yml (2 changes, vendored)
@@ -14,7 +14,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
.github/ISSUE_TEMPLATE/400-bug-report.yml (33 changes, vendored)
@@ -8,38 +8,37 @@ body:
   attributes:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-- type: markdown
-  attributes:
-    value: |
-      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
-      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
-      - Passwords or authentication credentials
-      - Private URLs or endpoints
-      - Personal or confidential data
-
-      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
   attributes:
     label: Your current environment
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
       It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       <details>
-      <summary>The output of <code>python collect_env.py</code></summary>
+      <summary>The output of `python collect_env.py`</summary>

       ```text
       Your output of `python collect_env.py` here
       ```

       </details>
   validations:
     required: true
+- type: textarea
+  attributes:
+    label: Model Input Dumps
+    description: |
+      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
+    placeholder: |
+      Upload the dumped input file.
+  validations:
+    required: false
 - type: textarea
   attributes:
     label: 🐛 Describe the bug
@@ -85,20 +84,20 @@ body:
       ```

       ```
-      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+      The error message you got, with the full traceback.
       ```
   validations:
     required: true
 - type: markdown
   attributes:
-    value: |
+    value: >
-      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
+      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:

       - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).

       - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

-      Thanks for reporting 🙏!
+      Thanks for contributing 🎉!
 - type: checkboxes
   id: askllm
   attributes:
.github/ISSUE_TEMPLATE/450-ci-failure.yml (69 changes, vendored)
@@ -1,69 +0,0 @@
-name: 🧪 CI failure report
-description: Report a failing test.
-title: "[CI Failure]: "
-labels: ["ci-failure"]
-
-body:
-- type: markdown
-  attributes:
-    value: >
-      #### Include the name of the failing Buildkite step and test file in the title.
-- type: input
-  attributes:
-    label: Name of failing test
-    description: |
-      Paste in the fully-qualified name of the failing test from the logs.
-    placeholder: |
-      `path/to/test_file.py::test_name[params]`
-  validations:
-    required: true
-- type: checkboxes
-  attributes:
-    label: Basic information
-    description: Select all items that apply to the failing test.
-    options:
-    - label: Flaky test
-    - label: Can reproduce locally
-    - label: Caused by external libraries (e.g. bug in `transformers`)
-- type: textarea
-  attributes:
-    label: 🧪 Describe the failing test
-    description: |
-      Please provide a clear and concise description of the failing test.
-    placeholder: |
-      A clear and concise description of the failing test.
-
-      ```
-      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
-      ```
-  validations:
-    required: true
-- type: textarea
-  attributes:
-    label: 📝 History of failing test
-    description: |
-      Since when did the test start to fail?
-      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
-
-      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
-
-      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
-
-      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
-
-      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
-    placeholder: |
-      Approximate timeline and/or problematic PRs
-
-      A link to the Buildkite analytics of the failing test (if available)
-  validations:
-    required: true
-- type: textarea
-  attributes:
-    label: CC List.
-    description: >
-      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
-- type: markdown
-  attributes:
-    value: >
-      Thanks for reporting 🙏!
.github/ISSUE_TEMPLATE/600-new-model.yml (2 changes, vendored)
@@ -9,7 +9,7 @@ body:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
   attributes:
     label: The model to consider.
@@ -35,7 +35,7 @@ body:
     description: |
       Please run the following and paste the output below.
       ```sh
-      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
       # For security purposes, please feel free to check the contents of collect_env.py before running it.
       python collect_env.py
       ```
.github/ISSUE_TEMPLATE/750-RFC.yml (2 changes, vendored)
@@ -46,7 +46,7 @@ body:
 - type: markdown
   attributes:
     value: >
-      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
+      Thanks for contributing 🎉!
 - type: checkboxes
   id: askllm
   attributes:
.github/ISSUE_TEMPLATE/800-misc-discussion.yml (new file, 28 changes, vendored)
@@ -0,0 +1,28 @@
+name: 🎲 Misc/random discussions that do not fit into the above categories.
+description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
+title: "[Misc]: "
+labels: ["misc"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Anything you want to discuss about vllm.
+    description: >
+      Anything you want to discuss about vllm.
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
+- type: checkboxes
+  id: askllm
+  attributes:
+    label: Before submitting a new issue...
+    options:
+    - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+      required: true
.github/ISSUE_TEMPLATE/config.yml (4 changes, vendored)
@@ -1,5 +1 @@
 blank_issues_enabled: false
-contact_links:
-- name: Questions
-  url: https://discuss.vllm.ai
-  about: Ask questions and discuss with other vLLM community members
.github/PULL_REQUEST_TEMPLATE.md (22 changes, vendored)
@@ -1,21 +1,5 @@
-<!-- markdownlint-disable -->
+FILL IN THE PR DESCRIPTION HERE
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.

-## Purpose
+FIX #xxxx (*link existing issues this PR will resolve*)

-## Test Plan
+**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **

-## Test Result

----

-<details>
-<summary> Essential Elements of an Effective PR Description Checklist </summary>

-- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan, such as providing test command.
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results
-- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
-- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
-</details>

-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
Some files were not shown because too many files have changed in this diff.