Compare commits
4 Commits
fp8_ep_dp
...
sampler-en
| Author | SHA1 | Date | |
|---|---|---|---|
| 4c42267293 | |||
| 24f68342b4 | |||
| c5d963835b | |||
| b313220727 |
@ -8,12 +8,12 @@ import zipfile
|
||||
# Note that we have 400 MiB quota, please use it wisely.
|
||||
# See https://github.com/pypi/support/issues/3792 .
|
||||
# Please also sync the value with the one in Dockerfile.
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
|
||||
|
||||
|
||||
def print_top_10_largest_files(zip_file):
|
||||
"""Print the top 10 largest files in the given zip file."""
|
||||
with zipfile.ZipFile(zip_file, "r") as z:
|
||||
with zipfile.ZipFile(zip_file, 'r') as z:
|
||||
file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
|
||||
file_sizes.sort(key=lambda x: x[1], reverse=True)
|
||||
for f, size in file_sizes[:10]:
|
||||
@ -28,18 +28,14 @@ def check_wheel_size(directory):
|
||||
wheel_path = os.path.join(root, file_name)
|
||||
wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
|
||||
if wheel_size_mb > VLLM_MAX_SIZE_MB:
|
||||
print(
|
||||
f"Not allowed: Wheel {wheel_path} is larger "
|
||||
f"({wheel_size_mb:.2f} MB) than the limit "
|
||||
f"({VLLM_MAX_SIZE_MB} MB)."
|
||||
)
|
||||
print(f"Not allowed: Wheel {wheel_path} is larger "
|
||||
f"({wheel_size_mb:.2f} MB) than the limit "
|
||||
f"({VLLM_MAX_SIZE_MB} MB).")
|
||||
print_top_10_largest_files(wheel_path)
|
||||
return 1
|
||||
else:
|
||||
print(
|
||||
f"Wheel {wheel_path} is within the allowed size "
|
||||
f"({wheel_size_mb:.2f} MB)."
|
||||
)
|
||||
print(f"Wheel {wheel_path} is within the allowed size "
|
||||
f"({wheel_size_mb:.2f} MB).")
|
||||
return 0
|
||||
|
||||
|
||||
@ -49,4 +45,4 @@ if __name__ == "__main__":
|
||||
sys.exit(1)
|
||||
|
||||
directory = sys.argv[1]
|
||||
sys.exit(check_wheel_size(directory))
|
||||
sys.exit(check_wheel_size(directory))
|
||||
@ -22,5 +22,5 @@ with open("index.html", "w") as f:
|
||||
print(f"Generated index.html for {args.wheel}")
|
||||
# cloudfront requires escaping the '+' character
|
||||
f.write(
|
||||
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
|
||||
)
|
||||
template.format(wheel=filename,
|
||||
wheel_html_escaped=filename.replace("+", "%2B")))
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
|
||||
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
|
||||
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
|
||||
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
|
||||
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
|
||||
tasks:
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
|
||||
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
||||
tasks:
|
||||
|
||||
@ -1,11 +0,0 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
|
||||
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.335
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.323
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "mgoin/Minitron-4B-Base-FP8"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
|
||||
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
|
||||
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
|
||||
tasks:
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
# For hf script, without -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
|
||||
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
|
||||
@ -1,12 +0,0 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.30
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.465
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
|
||||
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
|
||||
tasks:
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
|
||||
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
|
||||
tasks:
|
||||
|
||||
@ -1,11 +0,0 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.54
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.59
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
@ -1,11 +0,0 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.47
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.64
|
||||
limit: 1319
|
||||
num_fewshot: 5
|
||||
@ -1,4 +1,3 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
|
||||
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
|
||||
tasks:
|
||||
|
||||
@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
|
||||
Mixtral-8x7B-Instruct-v0.1.yaml
|
||||
Qwen2-57B-A14-Instruct.yaml
|
||||
DeepSeek-V2-Lite-Chat.yaml
|
||||
Meta-Llama-3-8B-QQQ.yaml
|
||||
|
||||
@ -1,6 +1,10 @@
|
||||
Qwen2.5-1.5B-Instruct.yaml
|
||||
Meta-Llama-3-8B-Instruct.yaml
|
||||
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
|
||||
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
|
||||
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
|
||||
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
|
||||
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
|
||||
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
|
||||
Minitron-4B-Base-FP8.yaml
|
||||
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
|
||||
Qwen2-1.5B-Instruct-FP8W8.yaml
|
||||
Meta-Llama-3-8B-QQQ.yaml
|
||||
|
||||
@ -1,43 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--config-list-file",
|
||||
action="store",
|
||||
help="Path to the file listing model config YAMLs (one per line)",
|
||||
)
|
||||
parser.addoption(
|
||||
"--tp-size",
|
||||
action="store",
|
||||
default="1",
|
||||
help="Tensor parallel size to use for evaluation",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def config_list_file(pytestconfig, config_dir):
|
||||
rel_path = pytestconfig.getoption("--config-list-file")
|
||||
return config_dir / rel_path
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def tp_size(pytestconfig):
|
||||
return pytestconfig.getoption("--tp-size")
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc):
|
||||
if "config_filename" in metafunc.fixturenames:
|
||||
rel_path = metafunc.config.getoption("--config-list-file")
|
||||
config_list_file = Path(rel_path).resolve()
|
||||
config_dir = config_list_file.parent
|
||||
with open(config_list_file, encoding="utf-8") as f:
|
||||
configs = [
|
||||
config_dir / line.strip()
|
||||
for line in f
|
||||
if line.strip() and not line.startswith("#")
|
||||
]
|
||||
metafunc.parametrize("config_filename", configs)
|
||||
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
59
.buildkite/lm-eval-harness/run-tests.sh
Normal file
@ -0,0 +1,59 @@
|
||||
#!/bin/bash
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on GSM8k using vllm and compares to "
|
||||
echo "precomputed baseline (measured by HF transformers.)"
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -c - path to the test data config (e.g. configs/small-models.txt)"
|
||||
echo " -t - tensor parallel size"
|
||||
echo
|
||||
}
|
||||
|
||||
SUCCESS=0
|
||||
|
||||
while getopts "c:t:" OPT; do
|
||||
case ${OPT} in
|
||||
c )
|
||||
CONFIG="$OPTARG"
|
||||
;;
|
||||
t )
|
||||
TP_SIZE="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Parse list of configs.
|
||||
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
|
||||
|
||||
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
|
||||
do
|
||||
LOCAL_SUCCESS=0
|
||||
|
||||
echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
|
||||
|
||||
export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
|
||||
export LM_EVAL_TP_SIZE=$TP_SIZE
|
||||
pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
|
||||
|
||||
if [[ $LOCAL_SUCCESS == 0 ]]; then
|
||||
echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
|
||||
else
|
||||
echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
|
||||
fi
|
||||
|
||||
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
|
||||
|
||||
done
|
||||
|
||||
if [ "${SUCCESS}" -eq "0" ]; then
|
||||
exit 0
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
@ -3,52 +3,67 @@
|
||||
LM eval harness on model to compare vs HF baseline computed offline.
|
||||
Configs are found in configs/$MODEL.yaml
|
||||
|
||||
pytest -s -v test_lm_eval_correctness.py \
|
||||
--config-list-file=configs/models-small.txt \
|
||||
--tp-size=1
|
||||
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
||||
* export LM_EVAL_TP_SIZE=4
|
||||
* pytest -s test_lm_eval_correctness.py
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import lm_eval
|
||||
import numpy as np
|
||||
import numpy
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
RTOL = 0.08
|
||||
RTOL = 0.05
|
||||
TEST_DATA_FILE = os.environ.get(
|
||||
"LM_EVAL_TEST_DATA_FILE",
|
||||
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
|
||||
|
||||
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
|
||||
|
||||
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||
model_args = (
|
||||
f"pretrained={eval_config['model_name']},"
|
||||
f"tensor_parallel_size={tp_size},"
|
||||
f"enforce_eager=true,"
|
||||
f"add_bos_token=true,"
|
||||
f"trust_remote_code={trust_remote_code}"
|
||||
)
|
||||
def launch_lm_eval(eval_config):
|
||||
trust_remote_code = eval_config.get('trust_remote_code', False)
|
||||
|
||||
model_args = f"pretrained={eval_config['model_name']}," \
|
||||
f"tensor_parallel_size={TP_SIZE}," \
|
||||
f"add_bos_token=true," \
|
||||
f"trust_remote_code={trust_remote_code}"
|
||||
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
batch_size="auto",
|
||||
)
|
||||
batch_size="auto")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def test_lm_eval_correctness_param(config_filename, tp_size):
|
||||
eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
|
||||
def test_lm_eval_correctness():
|
||||
eval_config = yaml.safe_load(
|
||||
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
|
||||
|
||||
results = launch_lm_eval(eval_config, tp_size)
|
||||
if eval_config[
|
||||
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
|
||||
pytest.skip("FBGEMM is currently failing on main.")
|
||||
|
||||
# Launch eval requests.
|
||||
results = launch_lm_eval(eval_config)
|
||||
|
||||
# Confirm scores match ground truth.
|
||||
success = True
|
||||
for task in eval_config["tasks"]:
|
||||
for metric in task["metrics"]:
|
||||
ground_truth = metric["value"]
|
||||
measured_value = results["results"][task["name"]][metric["name"]]
|
||||
print(
|
||||
f"{task['name']} | {metric['name']}: "
|
||||
f"ground_truth={ground_truth} | measured={measured_value}"
|
||||
)
|
||||
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
|
||||
print(f'{task["name"]} | {metric["name"]}: '
|
||||
f'ground_truth={ground_truth} | measured={measured_value}')
|
||||
success = success and numpy.isclose(
|
||||
ground_truth, measured_value, rtol=RTOL)
|
||||
|
||||
# Assert at the end, print all scores even on failure for debugging.
|
||||
assert success
|
||||
|
||||
@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
|
||||
|
||||
### Visualizing the results
|
||||
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
|
||||
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
|
||||
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
|
||||
If you do not see the table, please wait till the benchmark finish running.
|
||||
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
|
||||
|
||||
@ -65,18 +65,18 @@ def read_markdown(file):
|
||||
|
||||
|
||||
def results_to_json(latency, throughput, serving):
|
||||
return json.dumps(
|
||||
{
|
||||
"latency": latency.to_dict(),
|
||||
"throughput": throughput.to_dict(),
|
||||
"serving": serving.to_dict(),
|
||||
}
|
||||
)
|
||||
return json.dumps({
|
||||
'latency': latency.to_dict(),
|
||||
'throughput': throughput.to_dict(),
|
||||
'serving': serving.to_dict()
|
||||
})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
@ -120,8 +120,7 @@ if __name__ == "__main__":
|
||||
for perc in [10, 25, 50, 75, 90, 99]:
|
||||
# Multiply 1000 to convert the time unit from s to ms
|
||||
raw_result.update(
|
||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
||||
)
|
||||
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
|
||||
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
||||
|
||||
# add the result to raw_result
|
||||
@ -154,27 +153,26 @@ if __name__ == "__main__":
|
||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
||||
|
||||
raw_results_json = results_to_json(
|
||||
latency_results, throughput_results, serving_results
|
||||
)
|
||||
raw_results_json = results_to_json(latency_results, throughput_results,
|
||||
serving_results)
|
||||
|
||||
# remapping the key, for visualization purpose
|
||||
if not latency_results.empty:
|
||||
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
|
||||
columns=latency_column_mapping
|
||||
)
|
||||
latency_results = latency_results[list(
|
||||
latency_column_mapping.keys())].rename(
|
||||
columns=latency_column_mapping)
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
serving_results = serving_results[list(
|
||||
serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping)
|
||||
if not throughput_results.empty:
|
||||
throughput_results = throughput_results[
|
||||
list(throughput_results_column_mapping.keys())
|
||||
].rename(columns=throughput_results_column_mapping)
|
||||
throughput_results = throughput_results[list(
|
||||
throughput_results_column_mapping.keys())].rename(
|
||||
columns=throughput_results_column_mapping)
|
||||
|
||||
processed_results_json = results_to_json(
|
||||
latency_results, throughput_results, serving_results
|
||||
)
|
||||
processed_results_json = results_to_json(latency_results,
|
||||
throughput_results,
|
||||
serving_results)
|
||||
|
||||
for df in [latency_results, serving_results, throughput_results]:
|
||||
if df.empty:
|
||||
@ -186,39 +184,38 @@ if __name__ == "__main__":
|
||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||
# we want to turn it into "8xGPUTYPE"
|
||||
df["GPU"] = df["GPU"].apply(
|
||||
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
|
||||
)
|
||||
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
|
||||
|
||||
# get markdown tables
|
||||
latency_md_table = tabulate(
|
||||
latency_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
serving_md_table = tabulate(
|
||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
throughput_md_table = tabulate(
|
||||
throughput_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
latency_md_table = tabulate(latency_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
serving_md_table = tabulate(serving_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
throughput_md_table = tabulate(throughput_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
|
||||
# document the result
|
||||
with open(results_folder / "benchmark_results.md", "w") as f:
|
||||
results = read_markdown(
|
||||
"../.buildkite/nightly-benchmarks/"
|
||||
+ "performance-benchmarks-descriptions.md"
|
||||
)
|
||||
|
||||
results = read_markdown("../.buildkite/nightly-benchmarks/" +
|
||||
"performance-benchmarks-descriptions.md")
|
||||
results = results.format(
|
||||
latency_tests_markdown_table=latency_md_table,
|
||||
throughput_tests_markdown_table=throughput_md_table,
|
||||
serving_tests_markdown_table=serving_md_table,
|
||||
benchmarking_results_in_json_string=processed_results_json,
|
||||
)
|
||||
benchmarking_results_in_json_string=processed_results_json)
|
||||
f.write(results)
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / "benchmark_results.json", "w") as f:
|
||||
results = (
|
||||
latency_results.to_dict(orient="records")
|
||||
+ throughput_results.to_dict(orient="records")
|
||||
+ serving_results.to_dict(orient="records")
|
||||
)
|
||||
|
||||
results = latency_results.to_dict(
|
||||
orient='records') + throughput_results.to_dict(
|
||||
orient='records') + serving_results.to_dict(orient='records')
|
||||
f.write(json.dumps(results))
|
||||
|
||||
@ -14,12 +14,15 @@ def main(model, cachedir):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download and save Hugging Face tokenizer"
|
||||
)
|
||||
parser.add_argument("--model", type=str, required=True, help="Name of the model")
|
||||
parser.add_argument(
|
||||
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
|
||||
)
|
||||
description="Download and save Hugging Face tokenizer")
|
||||
parser.add_argument("--model",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Name of the model")
|
||||
parser.add_argument("--cachedir",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Directory to save the tokenizer")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args.model, args.cachedir)
|
||||
|
||||
@ -11,33 +11,33 @@ from tabulate import tabulate
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Parse command line arguments for summary-nightly-results script."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--results-folder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The folder where the results are stored.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--description", type=str, required=True, help="Description of the results."
|
||||
)
|
||||
description=
|
||||
'Parse command line arguments for summary-nightly-results script.')
|
||||
parser.add_argument('--results-folder',
|
||||
type=str,
|
||||
required=True,
|
||||
help='The folder where the results are stored.')
|
||||
parser.add_argument('--description',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Description of the results.')
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_perf(df, method, model, metric):
|
||||
|
||||
means = []
|
||||
|
||||
for qps in [2, 4, 8, 16, "inf"]:
|
||||
target = df["Test name"].str.contains(model)
|
||||
target = target & df["Engine"].str.contains(method)
|
||||
target = target & df["Test name"].str.contains("qps_" + str(qps))
|
||||
target = df['Test name'].str.contains(model)
|
||||
target = target & df['Engine'].str.contains(method)
|
||||
target = target & df['Test name'].str.contains("qps_" + str(qps))
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.0)
|
||||
means.append(0.)
|
||||
else:
|
||||
means.append(filtered_df[metric].values[0])
|
||||
|
||||
@ -45,6 +45,7 @@ def get_perf(df, method, model, metric):
|
||||
|
||||
|
||||
def get_perf_w_std(df, method, model, metric):
|
||||
|
||||
if metric in ["TTFT", "ITL"]:
|
||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||
mean = mean.tolist()
|
||||
@ -59,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
|
||||
else:
|
||||
assert metric == "Tput"
|
||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||
df, method, model, "Output Tput (tok/s)"
|
||||
)
|
||||
df, method, model, "Output Tput (tok/s)")
|
||||
mean = mean.tolist()
|
||||
std = None
|
||||
|
||||
@ -80,17 +80,18 @@ def main(args):
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
|
||||
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
|
||||
|
||||
with open(args.description) as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(nightly_results_benchmarking_table=md_table)
|
||||
description = description.format(
|
||||
nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
main(args)
|
||||
|
||||
@ -10,24 +10,15 @@ set -x
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
if command -v nvidia-smi; then
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
||||
fi
|
||||
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
if command -v nvidia-smi; then
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
||||
fi
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
@ -99,15 +90,9 @@ kill_gpu_processes() {
|
||||
|
||||
|
||||
# wait until GPU memory usage smaller than 1GB
|
||||
if command -v nvidia-smi; then
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
elif command -v amd-smi; then
|
||||
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
fi
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
|
||||
# remove vllm config file
|
||||
rm -rf ~/.config/vllm
|
||||
|
||||
@ -34,8 +34,10 @@ serving_column_mapping = {
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
@ -54,16 +56,17 @@ if __name__ == "__main__":
|
||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
serving_results = serving_results[list(
|
||||
serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping)
|
||||
|
||||
serving_md_table_with_headers = tabulate(
|
||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
serving_md_table_with_headers = tabulate(serving_results,
|
||||
headers='keys',
|
||||
tablefmt='pipe',
|
||||
showindex=False)
|
||||
# remove the first line of header
|
||||
serving_md_table_lines = serving_md_table_with_headers.split("\n")
|
||||
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
|
||||
serving_md_table_lines = serving_md_table_with_headers.split('\n')
|
||||
serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
|
||||
|
||||
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
|
||||
@ -73,9 +76,10 @@ if __name__ == "__main__":
|
||||
# document results with header.
|
||||
# for those who wants to reproduce our benchmark.
|
||||
f.write(serving_md_table_with_headers)
|
||||
f.write("\n")
|
||||
f.write('\n')
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
|
||||
results = serving_results.to_dict(orient="records")
|
||||
|
||||
results = serving_results.to_dict(orient='records')
|
||||
f.write(json.dumps(results))
|
||||
|
||||
@ -63,12 +63,10 @@
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
"disable_log_requests": "",
|
||||
"tensor_parallel_size": 4,
|
||||
"swap_space": 16,
|
||||
"speculative_config": {
|
||||
"model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
"draft_tensor_parallel_size": 1
|
||||
}
|
||||
"swap_space": 16,
|
||||
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
|
||||
"num_speculative_tokens": 4,
|
||||
"speculative_draft_tensor_parallel_size": 1
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||
|
||||
@ -1,46 +0,0 @@
|
||||
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||
# following differences:
|
||||
# - ruff line length is overridden to 88
|
||||
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"vllm/third_party/**" = ["ALL"]
|
||||
"vllm/version.py" = ["F401"]
|
||||
"vllm/_version.py" = ["ALL"]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# pycodestyle
|
||||
"E",
|
||||
# Pyflakes
|
||||
"F",
|
||||
# pyupgrade
|
||||
"UP",
|
||||
# flake8-bugbear
|
||||
"B",
|
||||
# flake8-simplify
|
||||
"SIM",
|
||||
# isort
|
||||
"I",
|
||||
# flake8-logging-format
|
||||
"G",
|
||||
]
|
||||
ignore = [
|
||||
# star imports
|
||||
"F405", "F403",
|
||||
# lambda expression assignment
|
||||
"E731",
|
||||
# Loop control variable not used within loop body
|
||||
"B007",
|
||||
# f-string format
|
||||
"UP032",
|
||||
# Can remove once 3.10+ is the minimum Python version
|
||||
"UP007",
|
||||
]
|
||||
|
||||
[tool.ruff.format]
|
||||
docstring-code-format = true
|
||||
@ -1,23 +1,23 @@
|
||||
steps:
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
- label: "Build wheel - CUDA 12.4"
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- label: "Build wheel - CUDA 12.6"
|
||||
- label: "Build wheel - CUDA 12.1"
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -31,10 +31,10 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -48,7 +48,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build and publish TPU release image"
|
||||
@ -57,14 +57,12 @@ steps:
|
||||
agents:
|
||||
queue: tpu_queue_postmerge
|
||||
commands:
|
||||
- "yes | docker system prune -a"
|
||||
- "git fetch --all"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
|
||||
- "docker push vllm/vllm-tpu:nightly"
|
||||
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
username: vllm
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
@ -84,22 +82,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build Neuron release image"
|
||||
key: block-neuron-release-image-build
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build and publish Neuron release image"
|
||||
depends_on: block-neuron-release-image-build
|
||||
agents:
|
||||
queue: neuron-postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -3,9 +3,6 @@
|
||||
# This script runs test inside the corresponding ROCm docker container.
|
||||
set -o pipefail
|
||||
|
||||
# Export Python path
|
||||
export PYTHONPATH=".."
|
||||
|
||||
# Print ROCm version
|
||||
echo "--- Confirming Clean Initial State"
|
||||
while true; do
|
||||
@ -77,102 +74,50 @@ HF_MOUNT="/root/.cache/huggingface"
|
||||
|
||||
commands=$@
|
||||
echo "Commands:$commands"
|
||||
|
||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
#ignore certain kernels tests
|
||||
if [[ $commands == *" kernels/core"* ]]; then
|
||||
if [[ $commands == *" kernels "* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||
--ignore=kernels/core/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/attention"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/attention/stest_attention_selector.py \
|
||||
--ignore=kernels/attention/test_blocksparse_attention.py \
|
||||
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/attention/test_attention_selector.py \
|
||||
--ignore=kernels/attention/test_flash_attn.py \
|
||||
--ignore=kernels/attention/test_flashinfer.py \
|
||||
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/attention/test_mha_attn.py \
|
||||
--ignore=kernels/attention/test_lightning_attn.py \
|
||||
--ignore=kernels/attention/test_attention.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/quantization"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/quantization/test_int8_quant.py \
|
||||
--ignore=kernels/quantization/test_aqlm.py \
|
||||
--ignore=kernels/quantization/test_machete_mm.py \
|
||||
--ignore=kernels/quantization/test_block_fp8.py \
|
||||
--ignore=kernels/quantization/test_block_int8.py \
|
||||
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/mamba"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||
fi
|
||||
|
||||
if [[ $commands == *" kernels/moe"* ]]; then
|
||||
commands="${commands} \
|
||||
--ignore=kernels/moe/test_moe.py \
|
||||
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||
--ignore=kernels/test_attention_selector.py \
|
||||
--ignore=kernels/test_blocksparse_attention.py \
|
||||
--ignore=kernels/test_causal_conv1d.py \
|
||||
--ignore=kernels/test_cutlass.py \
|
||||
--ignore=kernels/test_encoder_decoder_attn.py \
|
||||
--ignore=kernels/test_flash_attn.py \
|
||||
--ignore=kernels/test_flashinfer.py \
|
||||
--ignore=kernels/test_int8_quant.py \
|
||||
--ignore=kernels/test_machete_gemm.py \
|
||||
--ignore=kernels/test_mamba_ssm.py \
|
||||
--ignore=kernels/test_marlin_gemm.py \
|
||||
--ignore=kernels/test_moe.py \
|
||||
--ignore=kernels/test_prefix_prefill.py \
|
||||
--ignore=kernels/test_rand.py \
|
||||
--ignore=kernels/test_sampler.py \
|
||||
--ignore=kernels/test_cascade_flash_attn.py \
|
||||
--ignore=kernels/test_mamba_mixer2.py \
|
||||
--ignore=kernels/test_aqlm.py \
|
||||
--ignore=kernels/test_machete_mm.py \
|
||||
--ignore=kernels/test_mha_attn.py \
|
||||
--ignore=kernels/test_block_fp8.py \
|
||||
--ignore=kernels/test_permute_cols.py"
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/openai tests
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_chat.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_sleep.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/llm tests
|
||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
fi
|
||||
|
||||
#Obsolete currently
|
||||
##ignore certain Entrypoints/llm tests
|
||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
#fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
@ -181,8 +126,6 @@ fi
|
||||
|
||||
|
||||
PARALLEL_JOB_COUNT=8
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
@ -203,7 +146,6 @@ if [[ $commands == *"--shard-id="* ]]; then
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}_${GPU}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands_gpu}" \
|
||||
@ -234,7 +176,6 @@ else
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
/bin/bash -c "${commands}"
|
||||
@ -5,8 +5,8 @@
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
# cd 2 levels into the working directory
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
||||
# cd into parent directory of this file
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
|
||||
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
|
||||
docker build -t cpu-test -f Dockerfile.ppc64le .
|
||||
|
||||
@ -8,19 +8,15 @@ set -ex
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
|
||||
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
|
||||
}
|
||||
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
||||
@ -40,6 +36,8 @@ function cpu_tests() {
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip install -r vllm/requirements/test.txt
|
||||
pip install -r vllm/requirements/cpu.txt
|
||||
pytest -v -s tests/kernels/test_cache.py -m cpu_model
|
||||
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
@ -9,7 +9,6 @@ python3 use_existing_torch.py
|
||||
|
||||
# Try building the docker image
|
||||
DOCKER_BUILDKIT=1 docker build . \
|
||||
--file docker/Dockerfile \
|
||||
--target vllm-openai \
|
||||
--platform "linux/arm64" \
|
||||
-t gh200-test \
|
||||
@ -5,22 +5,20 @@
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
|
||||
docker build -t hpu-test-env -f Dockerfile.hpu .
|
||||
|
||||
# Setup cleanup
|
||||
# certain versions of HPU software stack have a bug that can
|
||||
# override the exit code of the script, so we need to use
|
||||
# separate remove_docker_containers and remove_docker_containers_and_exit
|
||||
# separate remove_docker_container and remove_docker_container_and_exit
|
||||
# functions, while other platforms only need one remove_docker_container
|
||||
# function.
|
||||
EXITCODE=1
|
||||
remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
|
||||
remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
|
||||
trap remove_docker_containers_and_exit EXIT
|
||||
remove_docker_containers
|
||||
remove_docker_container() { docker rm -f hpu-test || true; }
|
||||
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
|
||||
trap remove_docker_container_and_exit EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
|
||||
|
||||
EXITCODE=$?
|
||||
@ -3,7 +3,7 @@
|
||||
set -euox pipefail
|
||||
|
||||
if [[ $# -lt 4 ]]; then
|
||||
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
HF_CACHE="$(realpath ~)/huggingface"
|
||||
mkdir -p "${HF_CACHE}"
|
||||
HF_MOUNT="/root/.cache/huggingface"
|
||||
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
|
||||
|
||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||
|
||||
# Try building the docker image
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
|
||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
||||
|
||||
# prune old image and containers to save disk space, and only once a day
|
||||
# by using a timestamp file in tmp.
|
||||
@ -36,7 +35,7 @@ else
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
|
||||
docker build -t "${image_name}" -f Dockerfile.neuron .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
@ -48,16 +47,8 @@ trap remove_docker_container EXIT
|
||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||
-e "HF_HOME=${HF_MOUNT}" \
|
||||
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||
--name "${container_name}" \
|
||||
${image_name} \
|
||||
/bin/bash -c "
|
||||
python3 /workspace/vllm/examples/offline_inference/neuron.py;
|
||||
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
|
||||
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
|
||||
echo 'Running test file: '$f;
|
||||
python3 -m pytest \$f -v --capture=tee-sys;
|
||||
done
|
||||
"
|
||||
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
|
||||
42
.buildkite/run-tpu-v1-test.sh
Executable file
42
.buildkite/run-tpu-v1-test.sh
Executable file
@ -0,0 +1,42 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo TEST_1 \
|
||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& echo TEST_2 \
|
||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& echo TEST_3 \
|
||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& echo TEST_4 \
|
||||
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& echo TEST_5 \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
|
||||
&& echo TEST_6 \
|
||||
&& pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \
|
||||
&& echo TEST_7 \
|
||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
|
||||
|
||||
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
||||
docker build -t ${image_name} -f Dockerfile.xpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
@ -1,48 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the CPU docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
if [[ -n "$container_id" ]]; then
|
||||
podman rm -f "$container_id" || true
|
||||
fi
|
||||
podman system prune -f
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
||||
|
||||
# Run the image
|
||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
|
||||
|
||||
function cpu_tests() {
|
||||
|
||||
# offline inference
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||
pip install sentence-transformers datamodel_code_generator
|
||||
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
|
||||
export container_id
|
||||
export -f cpu_tests
|
||||
timeout 40m bash -c cpu_tests
|
||||
|
||||
@ -1,181 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -xu
|
||||
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
cleanup_docker
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c '
|
||||
set -e # Exit immediately if a command exits with a non-zero status.
|
||||
set -u # Treat unset variables as an error.
|
||||
|
||||
echo "--- Starting script inside Docker container ---"
|
||||
|
||||
# Create results directory
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $RESULTS_DIR"
|
||||
|
||||
# Install dependencies
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
tpu-info
|
||||
echo "--- Starting Tests ---"
|
||||
set +e
|
||||
overall_script_exit_code=0
|
||||
|
||||
# --- Test Definitions ---
|
||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||
run_test() {
|
||||
local test_num=$1
|
||||
local test_name=$2
|
||||
local test_command=$3
|
||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||
local actual_exit_code
|
||||
|
||||
echo "--- TEST_$test_num: Running $test_name ---"
|
||||
|
||||
# Execute the test command.
|
||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||
actual_exit_code=$?
|
||||
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||
|
||||
if [ "$actual_exit_code" -ne 0 ]; then
|
||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||
if [ -f "$log_file" ]; then
|
||||
cat "$log_file" >&2
|
||||
else
|
||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||
fi
|
||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||
return "$actual_exit_code" # Return the failure code
|
||||
else
|
||||
echo "TEST_$test_num ($test_name) PASSED."
|
||||
return 0 # Return success
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to call run_test and update the overall script exit code
|
||||
run_and_track_test() {
|
||||
local test_num_arg="$1"
|
||||
local test_name_arg="$2"
|
||||
local test_command_arg="$3"
|
||||
|
||||
# Run the test
|
||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||
local test_specific_exit_code=$?
|
||||
|
||||
# If the test failed, set the overall script exit code to 1
|
||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||
# No need for extra echo here, run_test already logged the failure.
|
||||
overall_script_exit_code=1
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 0 "test_perf.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||
run_and_track_test 1 "test_compilation.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||
run_and_track_test 2 "test_basic.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||
run_and_track_test 7 "test_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||
run_and_track_test 9 "test_multimodal.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||
run_and_track_test 10 "test_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
|
||||
run_and_track_test 12 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 13 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||
else
|
||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||
fi
|
||||
exit "$overall_script_exit_code"
|
||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||
|
||||
# Capture the exit code of the docker run command
|
||||
DOCKER_RUN_EXIT_CODE=$?
|
||||
|
||||
# The trap will run for cleanup.
|
||||
# Exit the main script with the Docker run command's exit code.
|
||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||
exit "$DOCKER_RUN_EXIT_CODE"
|
||||
else
|
||||
echo "Docker run command completed successfully."
|
||||
exit 0
|
||||
fi
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
@ -8,7 +8,6 @@
|
||||
# Documentation
|
||||
# label(str): the name of the test. emoji allowed.
|
||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||
# command(str): the single command to run for tests. incompatible with commands.
|
||||
@ -32,17 +31,16 @@ steps:
|
||||
##### fast check tests #####
|
||||
|
||||
- label: Documentation Build # 2min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/test_docs"
|
||||
working_dir: "/vllm-workspace/test_docs/docs"
|
||||
fast_check: true
|
||||
no_gpu: True
|
||||
commands:
|
||||
- pip install -r ../requirements/docs.txt
|
||||
# TODO: add `--strict` once warnings in docstrings are fixed
|
||||
- mkdocs build
|
||||
- pip install -r ../../requirements/docs.txt
|
||||
- SPHINXOPTS=\"-W\" make html
|
||||
# Check API reference (if it fails, you may have missing mock imports)
|
||||
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/mq_llm_engine
|
||||
@ -58,13 +56,11 @@ steps:
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
|
||||
- label: Python-only Installation Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- tests/standalone_tests/python_only_compile.sh
|
||||
- setup.py
|
||||
@ -72,9 +68,8 @@ steps:
|
||||
- bash standalone_tests/python_only_compile.sh
|
||||
|
||||
- label: Basic Correctness Test # 30min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
#mirror_hardwares: [amd]
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
@ -89,7 +84,6 @@ steps:
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Chunked Prefill Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_chunked_prefill
|
||||
@ -98,7 +92,7 @@ steps:
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
|
||||
- label: Core Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amd]
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
- vllm/core
|
||||
@ -108,10 +102,9 @@ steps:
|
||||
- pytest -v -s core
|
||||
|
||||
- label: Entrypoints Test # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
@ -125,12 +118,11 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -138,7 +130,6 @@ steps:
|
||||
- vllm/core/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||
- tests/compile/test_basic_correctness
|
||||
- examples/offline_inference/rlhf.py
|
||||
@ -149,25 +140,21 @@ steps:
|
||||
# test with tp=2 and external_dp=2
|
||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
- pytest -v -s distributed/test_events.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
# TODO: create a dedicated test section for multi-GPU example tests
|
||||
# when we have multiple distributed example tests
|
||||
- pushd ../examples/offline_inference
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -175,13 +162,18 @@ steps:
|
||||
- tests/tracing
|
||||
commands:
|
||||
- pytest -v -s metrics
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
||||
- pytest -v -s tracing
|
||||
|
||||
##### fast check tests #####
|
||||
##### 1 GPU test #####
|
||||
|
||||
- label: Regression Test # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
@ -191,7 +183,7 @@ steps:
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/engine
|
||||
@ -199,31 +191,28 @@ steps:
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: V1 Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/engine
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/structured_output
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s v1/kv_connector/unit
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s v1/test_stats.py
|
||||
- pytest -v -s v1/test_utils.py
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_metrics_reader.py
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
@ -232,8 +221,8 @@ steps:
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: Examples Test # 25min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/entrypoints
|
||||
- examples/
|
||||
@ -248,7 +237,7 @@ steps:
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder.py
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
@ -257,7 +246,7 @@ steps:
|
||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||
|
||||
- label: Prefix Caching Test # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/prefix_caching
|
||||
@ -265,7 +254,6 @@ steps:
|
||||
- pytest -v -s prefix_caching
|
||||
|
||||
- label: Samplers Test # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/sampling_metadata.py
|
||||
@ -275,8 +263,18 @@ steps:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
|
||||
- label: LogitsProcessor Test # 5min
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/model_executor/guided_decoding
|
||||
- tests/test_logits_processor
|
||||
- tests/model_executor/test_guided_processors
|
||||
commands:
|
||||
- pytest -v -s test_logits_processor.py
|
||||
- pytest -v -s model_executor/test_guided_processors.py
|
||||
|
||||
- label: Speculative decoding tests # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/spec_decode
|
||||
- tests/spec_decode
|
||||
@ -287,29 +285,14 @@ steps:
|
||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||
|
||||
- label: LoRA Test %N # 15min each
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
|
||||
parallelism: 4
|
||||
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_sequence_parallelism.py
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
@ -318,123 +301,61 @@ steps:
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
|
||||
- label: Kernels Core Operation Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
- label: Kernels Test %N # 1h each
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- tests/kernels/core
|
||||
commands:
|
||||
- pytest -v -s kernels/core
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
- vllm/attention
|
||||
- vllm/v1/attention
|
||||
- tests/kernels/attention
|
||||
- tests/kernels
|
||||
commands:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Quantization Test %N
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/kernels/quantization
|
||||
commands:
|
||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/moe/
|
||||
- tests/kernels/moe
|
||||
- vllm/model_executor/layers/fused_moe/
|
||||
commands:
|
||||
- pytest -v -s kernels/moe
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
- tests/kernels/mamba
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 4
|
||||
|
||||
- label: Tensorizer Test # 11min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
mirror_hardwares: [amd]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
- tests/tensorizer_loader
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash scripts/run-benchmarks.sh
|
||||
- bash run-benchmarks.sh
|
||||
|
||||
- label: Benchmarks CLI Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/benchmarks/
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
|
||||
- label: Quantization Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
- label: Quantization Test # 33min
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
commands:
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||
|
||||
- label: OpenAI API correctness
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/entrypoints/openai/
|
||||
@ -443,7 +364,6 @@ steps:
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: Encoder Decoder tests # 5min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/encoder_decoder
|
||||
@ -451,117 +371,98 @@ steps:
|
||||
- pytest -v -s encoder_decoder
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 20 min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: false
|
||||
mirror_hardwares: [ amd ]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
- tests/mistral_tool_use
|
||||
commands:
|
||||
- pytest -v -s tool_use
|
||||
- pytest -v -s mistral_tool_use
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 24min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_utils.py
|
||||
- pytest -v -s models/test_vision.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
|
||||
|
||||
- label: Language Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
- label: Language Models Test (Standard) # 32min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m core_model
|
||||
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/language -m core_model
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
- label: Language Models Test (Extended) # 1h10min
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pytest -v -s models/language/generation -m 'not core_model'
|
||||
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
- label: Multi-Modal Models Test (Standard) # 40min
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/language/pooling
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/audio_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/multimodal/processing
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
|
||||
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
- pytest -v -s models/multimodal
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 48m
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
|
||||
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 2
|
||||
mirror_hardwares: [amdexperimental]
|
||||
- label: Multi-Modal Models Test (Extended) 2 # 38m
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
- tests/models/decoder_only/vision_language
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
- label: Quantized Models Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/models/quantization
|
||||
commands:
|
||||
- pytest -v -s models/quantization
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
@ -573,7 +474,6 @@ steps:
|
||||
##### multi gpus test #####
|
||||
|
||||
- label: Distributed Comm Ops Test # 7min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -584,7 +484,6 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
num_nodes: 2
|
||||
@ -603,7 +502,7 @@ steps:
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
#mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -618,38 +517,33 @@ steps:
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- vllm/v1/engine/
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
|
||||
# test sequence parallel
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
|
||||
# this test fails consistently.
|
||||
# TODO: investigate and fix
|
||||
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
|
||||
- label: Plugin Tests (2 GPUs) # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/plugins/
|
||||
- tests/plugins/
|
||||
commands:
|
||||
# begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
|
||||
# begin platform plugin tests, all the code in-between runs on dummy platform
|
||||
- pip install -e ./plugins/vllm_add_dummy_platform
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
@ -660,10 +554,8 @@ steps:
|
||||
- pytest -v -s distributed/test_distributed_oot.py
|
||||
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -684,7 +576,6 @@ steps:
|
||||
- pytest -v -s multi_step/test_correctness_llm.py
|
||||
|
||||
- label: Pipeline Parallelism Test # 45min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -698,7 +589,6 @@ steps:
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- label: LoRA TP Test (Distributed)
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
@ -711,10 +601,11 @@ steps:
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_minicpmv_tp.py
|
||||
- pytest -v -s -x lora/test_transfomers_model.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -724,7 +615,6 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
@ -763,4 +653,4 @@ steps:
|
||||
- vllm/model_executor/layers/quantization
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||
|
||||
@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
elif [[ $normal_wheel == *"cu121"* ]]; then
|
||||
# if $normal_wheel matches cu121, do not upload the index.html
|
||||
echo "Skipping index files for cu121 wheels"
|
||||
else
|
||||
# only upload index.html for cu128 wheels (default wheels)
|
||||
# only upload index.html for cu124 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||
fi
|
||||
@ -66,13 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||
# if $normal_wheel matches cu118, do not upload the index.html
|
||||
echo "Skipping index files for cu118 wheels"
|
||||
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||
# if $normal_wheel matches cu126, do not upload the index.html
|
||||
echo "Skipping index files for cu126 wheels"
|
||||
elif [[ $normal_wheel == *"cu121"* ]]; then
|
||||
# if $normal_wheel matches cu121, do not upload the index.html
|
||||
echo "Skipping index files for cu121 wheels"
|
||||
else
|
||||
# only upload index.html for cu128 wheels (default wheels)
|
||||
# only upload index.html for cu124 wheels (default wheels)
|
||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||
fi
|
||||
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
|
||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||
7
.github/CODEOWNERS
vendored
7
.github/CODEOWNERS
vendored
@ -12,8 +12,6 @@
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||
/vllm/model_executor/guided_decoding @mgoin @russellb
|
||||
/vllm/multimodal @DarkLight1337 @ywang96
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
CMakeLists.txt @tlrmchlsmth
|
||||
|
||||
# vLLM V1
|
||||
@ -41,8 +39,3 @@ CMakeLists.txt @tlrmchlsmth
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
|
||||
/tests/v1/structured_output @mgoin @russellb
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/lora @jeejeelee
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
2
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
@ -14,7 +14,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
2
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
@ -14,7 +14,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
|
||||
14
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
14
.github/ISSUE_TEMPLATE/400-bug-report.yml
vendored
@ -14,19 +14,19 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||
value: |
|
||||
<details>
|
||||
<summary>The output of <code>python collect_env.py</code></summary>
|
||||
<summary>The output of `python collect_env.py`</summary>
|
||||
|
||||
```text
|
||||
Your output of `python collect_env.py` here
|
||||
```
|
||||
|
||||
|
||||
</details>
|
||||
validations:
|
||||
required: true
|
||||
@ -75,20 +75,20 @@ body:
|
||||
```
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||
The error message you got, with the full traceback.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
|
||||
value: >
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
|
||||
|
||||
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
||||
|
||||
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
||||
|
||||
Thanks for reporting 🙏!
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
|
||||
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
69
.github/ISSUE_TEMPLATE/450-ci-failure.yml
vendored
@ -1,69 +0,0 @@
|
||||
name: 🧪 CI failure report
|
||||
description: Report a failing test.
|
||||
title: "[CI Failure]: "
|
||||
labels: ["ci-failure"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Include the name of the failing Buildkite step and test file in the title.
|
||||
- type: input
|
||||
attributes:
|
||||
label: Name of failing test
|
||||
description: |
|
||||
Paste in the fully-qualified name of the failing test from the logs.
|
||||
placeholder: |
|
||||
`path/to/test_file.py::test_name[params]`
|
||||
validations:
|
||||
required: true
|
||||
- type: checkboxes
|
||||
attributes:
|
||||
label: Basic information
|
||||
description: Select all items that apply to the failing test.
|
||||
options:
|
||||
- label: Flaky test
|
||||
- label: Can reproduce locally
|
||||
- label: Caused by external libraries (e.g. bug in `transformers`)
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 🧪 Describe the failing test
|
||||
description: |
|
||||
Please provide a clear and concise description of the failing test.
|
||||
placeholder: |
|
||||
A clear and concise description of the failing test.
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 📝 History of failing test
|
||||
description: |
|
||||
Since when did the test start to fail?
|
||||
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
|
||||
|
||||
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
|
||||
|
||||
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
|
||||
|
||||
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
|
||||
|
||||
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
|
||||
placeholder: |
|
||||
Approximate timeline and/or problematic PRs
|
||||
|
||||
A link to the Buildkite analytics of the failing test (if available)
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: CC List.
|
||||
description: >
|
||||
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for reporting 🙏!
|
||||
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
2
.github/ISSUE_TEMPLATE/600-new-model.yml
vendored
@ -9,7 +9,7 @@ body:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: The model to consider.
|
||||
|
||||
@ -35,7 +35,7 @@ body:
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
|
||||
2
.github/PULL_REQUEST_TEMPLATE.md
vendored
2
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
|
||||
FIX #xxxx (*link existing issues this PR will resolve*)
|
||||
|
||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
|
||||
|
||||
45
.github/mergify.yml
vendored
45
.github/mergify.yml
vendored
@ -19,7 +19,7 @@ pull_request_rules:
|
||||
- files~=\.buildkite/
|
||||
- files~=^cmake/
|
||||
- files=CMakeLists.txt
|
||||
- files~=^docker/Dockerfile
|
||||
- files~=^Dockerfile
|
||||
- files~=^requirements.*\.txt
|
||||
- files=setup.py
|
||||
actions:
|
||||
@ -55,19 +55,11 @@ pull_request_rules:
|
||||
description: Automatically apply structured-output label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^benchmarks/structured_schemas/
|
||||
- files=benchmarks/benchmark_serving_structured_output.py
|
||||
- files=benchmarks/run_structured_output_benchmark.sh
|
||||
- files=docs/features/structured_outputs.md
|
||||
- files=examples/offline_inference/structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||
- files~=^vllm/model_executor/guided_decoding/
|
||||
- files=tests/model_executor/test_guided_processors.py
|
||||
- files=tests/entrypoints/llm/test_guided_generate.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
- files=benchmarks/benchmark_serving_guided.py
|
||||
- files=benchmarks/benchmark_guided.py
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
@ -126,26 +118,6 @@ pull_request_rules:
|
||||
remove:
|
||||
- tpu
|
||||
|
||||
- name: label-tool-calling
|
||||
description: Automatically add tool-calling label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^tests/tool_use/
|
||||
- files~=^tests/mistral_tool_use/
|
||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||
- files=docs/features/tool_calling.md
|
||||
- files~=^examples/tool_chat_*
|
||||
- files=examples/offline_inference/chat_with_tools.py
|
||||
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
|
||||
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
|
||||
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- tool-calling
|
||||
|
||||
- name: ping author on conflicts and add 'needs-rebase' label
|
||||
conditions:
|
||||
- conflict
|
||||
@ -161,17 +133,6 @@ pull_request_rules:
|
||||
|
||||
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
|
||||
|
||||
- name: assign reviewer for tensorizer changes
|
||||
conditions:
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/tensorizer_loader/
|
||||
actions:
|
||||
assign:
|
||||
users:
|
||||
- "sangstar"
|
||||
|
||||
- name: remove 'needs-rebase' label when conflict is resolved
|
||||
conditions:
|
||||
- -conflict
|
||||
|
||||
2
.github/scripts/cleanup_pr_body.sh
vendored
2
.github/scripts/cleanup_pr_body.sh
vendored
@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
|
||||
python3 - <<EOF
|
||||
import regex as re
|
||||
import re
|
||||
|
||||
with open("${NEW}", "r") as file:
|
||||
content = file.read()
|
||||
|
||||
2
.github/workflows/add_label_automerge.yml
vendored
2
.github/workflows/add_label_automerge.yml
vendored
@ -1,6 +1,4 @@
|
||||
name: Add label on auto-merge enabled
|
||||
permissions:
|
||||
pull-requests: write
|
||||
on:
|
||||
pull_request_target:
|
||||
types:
|
||||
|
||||
7
.github/workflows/cleanup_pr_body.yml
vendored
7
.github/workflows/cleanup_pr_body.yml
vendored
@ -20,12 +20,7 @@ jobs:
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m pip install --upgrade pip
|
||||
python3 -m pip install regex
|
||||
|
||||
- name: Update PR description
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
|
||||
|
||||
9
.github/workflows/lint-and-deploy.yaml
vendored
9
.github/workflows/lint-and-deploy.yaml
vendored
@ -2,9 +2,6 @@ name: Lint and Deploy Charts
|
||||
|
||||
on: pull_request
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
lint-and-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
@ -53,7 +50,7 @@ jobs:
|
||||
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
|
||||
|
||||
- name: Build the Docker image vllm cpu
|
||||
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
|
||||
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
|
||||
|
||||
- name: Configuration of docker images, network and namespace for the kind cluster
|
||||
run: |
|
||||
@ -69,7 +66,7 @@ jobs:
|
||||
export AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
|
||||
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
|
||||
|
||||
|
||||
- name: curl test
|
||||
run: |
|
||||
kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
|
||||
@ -82,4 +79,4 @@ jobs:
|
||||
"max_tokens": 7,
|
||||
"temperature": 0
|
||||
}'):$CODE"
|
||||
echo "$CODE"
|
||||
echo "$CODE"
|
||||
3
.github/workflows/pre-commit.yml
vendored
3
.github/workflows/pre-commit.yml
vendored
@ -5,9 +5,6 @@ on:
|
||||
push:
|
||||
branches: [main]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
2
.github/workflows/reminder_comment.yml
vendored
2
.github/workflows/reminder_comment.yml
vendored
@ -1,6 +1,4 @@
|
||||
name: PR Reminder Comment Bot
|
||||
permissions:
|
||||
pull-requests: write
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened]
|
||||
|
||||
9
.gitignore
vendored
9
.gitignore
vendored
@ -3,6 +3,7 @@
|
||||
|
||||
# vllm-flash-attn built from source
|
||||
vllm/vllm_flash_attn/*
|
||||
!vllm/vllm_flash_attn/fa_utils.py
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
@ -77,6 +78,10 @@ instance/
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
docs/source/getting_started/examples/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
@ -146,7 +151,6 @@ venv.bak/
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
docs/examples
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
@ -199,6 +203,3 @@ benchmarks/**/*.json
|
||||
# Linting
|
||||
actionlint
|
||||
shellcheck*/
|
||||
|
||||
# Ingore moe/marlin_moe gen code
|
||||
csrc/moe/marlin_moe_wna16/kernel_*
|
||||
|
||||
@ -1,6 +1,3 @@
|
||||
default_install_hook_types:
|
||||
- pre-commit
|
||||
- commit-msg
|
||||
default_stages:
|
||||
- pre-commit # Run locally
|
||||
- manual # Run in CI
|
||||
@ -11,45 +8,43 @@ repos:
|
||||
hooks:
|
||||
- id: yapf
|
||||
args: [--in-place, --verbose]
|
||||
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.11.7
|
||||
rev: v0.9.3
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--output-format, github, --fix]
|
||||
- id: ruff-format
|
||||
files: ^(.buildkite|benchmarks|examples)/.*
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.1
|
||||
rev: v2.4.0
|
||||
hooks:
|
||||
- id: codespell
|
||||
additional_dependencies: ['tomli']
|
||||
args: ['--toml', 'pyproject.toml']
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 6.0.1
|
||||
rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
|
||||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||
rev: v20.1.3
|
||||
rev: v19.1.7
|
||||
hooks:
|
||||
- id: clang-format
|
||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||
types_or: [c++, cuda]
|
||||
args: [--style=file, --verbose]
|
||||
- repo: https://github.com/jackdewinter/pymarkdown
|
||||
rev: v0.9.29
|
||||
rev: v0.9.27
|
||||
hooks:
|
||||
- id: pymarkdown
|
||||
exclude: '.*\.inc\.md'
|
||||
args: [fix]
|
||||
- repo: https://github.com/rhysd/actionlint
|
||||
rev: v1.7.7
|
||||
hooks:
|
||||
- id: actionlint
|
||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||
rev: 0.6.17
|
||||
rev: 0.6.2
|
||||
hooks:
|
||||
- id: pip-compile
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
|
||||
args: [requirements/test.in, -o, requirements/test.txt]
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- repo: local
|
||||
hooks:
|
||||
@ -58,7 +53,7 @@ repos:
|
||||
entry: tools/mypy.sh 0 "local"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
|
||||
stages: [pre-commit] # Don't run in CI
|
||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.9
|
||||
@ -104,8 +99,8 @@ repos:
|
||||
args:
|
||||
- -c
|
||||
- |
|
||||
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
|
||||
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
|
||||
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
|
||||
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
|
||||
fi
|
||||
language: system
|
||||
verbose: true
|
||||
@ -124,25 +119,6 @@ repos:
|
||||
language: system
|
||||
always_run: true
|
||||
pass_filenames: false
|
||||
- id: update-dockerfile-graph
|
||||
name: Update Dockerfile dependency graph
|
||||
entry: tools/update-dockerfile-graph.sh
|
||||
language: script
|
||||
- id: enforce-import-regex-instead-of-re
|
||||
name: Enforce import regex as re
|
||||
entry: python tools/enforce_regex_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# forbid directly import triton
|
||||
- id: forbid-direct-triton-import
|
||||
name: "Forbid direct 'import triton'"
|
||||
entry: python tools/check_triton_import.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@ -8,8 +8,12 @@ build:
|
||||
tools:
|
||||
python: "3.12"
|
||||
|
||||
mkdocs:
|
||||
configuration: mkdocs.yaml
|
||||
sphinx:
|
||||
configuration: docs/source/conf.py
|
||||
fail_on_warning: true
|
||||
|
||||
# If using Sphinx, optionally build your docs in additional formats such as PDF
|
||||
formats: []
|
||||
|
||||
# Optionally declare the Python requirements required to build your docs
|
||||
python:
|
||||
|
||||
196
CMakeLists.txt
196
CMakeLists.txt
@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
|
||||
|
||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||
|
||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||
|
||||
@ -23,17 +24,17 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||
# Suppress potential warnings about unused manually-specified variables
|
||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||
|
||||
# Prevent installation of dependencies (cutlass) by default.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
#
|
||||
# Supported python versions. These versions will be searched in order, the
|
||||
# first match will be selected. These should be kept in sync with setup.py.
|
||||
#
|
||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||
|
||||
# Supported AMD GPU architectures.
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
|
||||
|
||||
#
|
||||
# Supported/expected torch versions for CUDA/ROCm.
|
||||
@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
|
||||
#
|
||||
# Note: the CUDA torch version is derived from pyproject.toml and various
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
# versions are derived from Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@ -79,15 +80,6 @@ endif()
|
||||
#
|
||||
find_package(Torch REQUIRED)
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||
else()
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||
#
|
||||
@ -235,34 +227,28 @@ endif()
|
||||
#
|
||||
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/cache_kernels.cu"
|
||||
"csrc/attention/paged_attention_v1.cu"
|
||||
"csrc/attention/paged_attention_v2.cu"
|
||||
"csrc/attention/merge_attn_states.cu"
|
||||
"csrc/attention/vertical_slash_index.cu"
|
||||
"csrc/pos_encoding_kernels.cu"
|
||||
"csrc/activation_kernels.cu"
|
||||
"csrc/layernorm_kernels.cu"
|
||||
"csrc/layernorm_quant_kernels.cu"
|
||||
"csrc/cuda_view.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||
"csrc/quantization/fp8/common.cu"
|
||||
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||
"csrc/quantization/activation_kernels.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/prepare_inputs/advance_step.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||
|
||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||
set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
|
||||
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
|
||||
# Please keep this in sync with FetchContent_Declare line below.
|
||||
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
|
||||
|
||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||
@ -280,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
cutlass
|
||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||
# Please keep this in sync with CUTLASS_REVISION line above.
|
||||
GIT_TAG ${CUTLASS_REVISION}
|
||||
GIT_TAG v3.8.0
|
||||
GIT_PROGRESS TRUE
|
||||
|
||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||
@ -292,16 +278,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
FetchContent_MakeAvailable(cutlass)
|
||||
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/permute_cols.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||
"csrc/cutlass_extensions/common.cpp"
|
||||
"csrc/attention/mla/cutlass_mla_entry.cu")
|
||||
"csrc/cutlass_extensions/common.cpp")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${VLLM_EXT_SRC}"
|
||||
@ -310,55 +297,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||
# are not supported by Machete yet.
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
||||
if (MARLIN_ARCHS)
|
||||
|
||||
#
|
||||
# For the Marlin kernels we automatically generate sources for various
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||
|
||||
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
||||
|
||||
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
||||
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
||||
RESULT_VARIABLE marlin_generation_result
|
||||
OUTPUT_VARIABLE marlin_generation_result
|
||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||
)
|
||||
|
||||
if (NOT marlin_generation_result EQUAL 0)
|
||||
message(FATAL_ERROR "Marlin generation failed."
|
||||
" Result: \"${marlin_generation_result}\""
|
||||
"\nCheck the log for details: "
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
||||
else()
|
||||
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
||||
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
||||
message(STATUS "Marlin generation completed successfully.")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||
|
||||
set(MARLIN_SRCS
|
||||
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
||||
@ -430,7 +372,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
||||
)
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
@ -455,9 +396,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
#
|
||||
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
||||
# kernels for the remaining archs that are not already built for 3x.
|
||||
# (Build 8.9 for FP8)
|
||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
|
||||
"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
||||
# subtract out the archs that are already built for 3x
|
||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||
if (SCALED_MM_2X_ARCHS)
|
||||
@ -508,9 +448,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${FP4_ARCHS}")
|
||||
@ -523,32 +461,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(FP4_ARCHS)
|
||||
endif()
|
||||
|
||||
# CUTLASS MLA Archs and flags
|
||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/attention/mla/cutlass_mla_kernels.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${MLA_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
|
||||
# Add MLA-specific include directories only to MLA source files
|
||||
set_source_files_properties(${SRCS}
|
||||
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
|
||||
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
|
||||
# clear MLA_ARCHS
|
||||
set(MLA_ARCHS)
|
||||
endif()
|
||||
|
||||
#
|
||||
# CUTLASS MoE kernels
|
||||
|
||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
||||
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
|
||||
# to compile MoE kernels that use its output.
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
||||
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||
@ -686,54 +605,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
||||
if (MARLIN_MOE_ARCHS)
|
||||
set(MARLIN_MOE_SRC
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
|
||||
"csrc/moe/marlin_moe_ops.cu")
|
||||
|
||||
#
|
||||
# For the Marlin MOE kernels we automatically generate sources for various
|
||||
# preselected input type pairs and schedules.
|
||||
# Generate sources:
|
||||
set(MOE_MARLIN_GEN_SCRIPT
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
|
||||
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
|
||||
|
||||
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
|
||||
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
|
||||
|
||||
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
|
||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -E env
|
||||
PYTHONPATH=$PYTHONPATH
|
||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
|
||||
RESULT_VARIABLE moe_marlin_generation_result
|
||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
||||
)
|
||||
|
||||
if (NOT moe_marlin_generation_result EQUAL 0)
|
||||
message(FATAL_ERROR "Marlin MOE generation failed."
|
||||
" Result: \"${moe_marlin_generation_result}\""
|
||||
"\nCheck the log for details: "
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
|
||||
else()
|
||||
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
|
||||
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
|
||||
message(STATUS "Marlin MOE generation completed successfully.")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
||||
endif()
|
||||
|
||||
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MOE_WNAA16_MARLIN_SRC}"
|
||||
SRCS "${MARLIN_MOE_SRC}"
|
||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
|
||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||
@ -741,17 +629,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(MOE_PERMUTE_SRC
|
||||
"csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
|
||||
"csrc/moe/moe_permute_unpermute_op.cu")
|
||||
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_PERMUTE_SRC}"
|
||||
CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
|
||||
endif()
|
||||
message(STATUS "Enabling moe extension.")
|
||||
define_gpu_extension_target(
|
||||
_moe_C
|
||||
@ -760,8 +637,6 @@ define_gpu_extension_target(
|
||||
SOURCES ${VLLM_MOE_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
|
||||
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
@ -771,7 +646,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
#
|
||||
set(VLLM_ROCM_EXT_SRC
|
||||
"csrc/rocm/torch_bindings.cpp"
|
||||
"csrc/rocm/skinny_gemms.cu"
|
||||
"csrc/rocm/attention.cu")
|
||||
|
||||
define_gpu_extension_target(
|
||||
@ -788,7 +662,5 @@ endif()
|
||||
# For CUDA we also build and ship some external projects.
|
||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(cmake/external_projects/flashmla.cmake)
|
||||
|
||||
# vllm-flash-attn should be last as it overwrites some CMake functions
|
||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||
endif ()
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
# Contributing to vLLM
|
||||
|
||||
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
|
||||
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
|
||||
|
||||
@ -2,14 +2,14 @@
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
# Please update any changes made here to
|
||||
# docs/contributing/dockerfile/dockerfile.md and
|
||||
# docs/assets/contributing/dockerfile-stages-dependency.png
|
||||
# docs/source/contributing/dockerfile/dockerfile.md and
|
||||
# docs/source/assets/contributing/dockerfile-stages-dependency.png
|
||||
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
# prepare basic build environment
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG TARGETPLATFORM
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
@ -19,10 +19,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||
&& for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
done \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
@ -37,7 +34,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
|
||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||
# as it was causing spam when compiling the CUTLASS kernels
|
||||
@ -70,14 +66,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cuda.txt requirements/cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/cuda.txt \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
uv pip install --system -r requirements/cuda.txt
|
||||
|
||||
# cuda arch list used by torch
|
||||
# can be useful for both `dev` and `test`
|
||||
# explicitly set the list to avoid issues with torch 2.2
|
||||
# see https://github.com/pytorch/pytorch/pull/123243
|
||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
|
||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
# Override the arch list for flash-attn to reduce the binary size
|
||||
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
|
||||
@ -94,11 +89,9 @@ COPY requirements/build.txt requirements/build.txt
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
uv pip install --system -r requirements/build.txt
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
@ -165,32 +158,24 @@ FROM base as dev
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
|
||||
# Workaround for #17068
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
||||
|
||||
COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/dev.txt \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
uv pip install --system -r requirements/dev.txt
|
||||
#################### DEV IMAGE ####################
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
# image with vLLM installed
|
||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
ARG PYTHON_VERSION=3.12
|
||||
WORKDIR /vllm-workspace
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
@ -200,10 +185,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
done \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
@ -218,7 +200,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
@ -239,8 +220,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
uv pip install --system dist/*.whl --verbose
|
||||
|
||||
# If we need to build FlashInfer wheel before its release:
|
||||
# $ export FLASHINFER_ENABLE_AOT=1
|
||||
@ -257,34 +237,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||
# FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
|
||||
if [[ "$CUDA_VERSION" == 12.8* ]]; then \
|
||||
uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
|
||||
else \
|
||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
if [ "$CUDA_MAJOR" -lt 12 ]; then \
|
||||
export FLASHINFER_ENABLE_SM90=0; \
|
||||
fi; \
|
||||
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
|
||||
fi \
|
||||
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
|
||||
fi
|
||||
COPY examples examples
|
||||
COPY benchmarks benchmarks
|
||||
COPY ./vllm/collect_env.py .
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
uv pip list
|
||||
|
||||
# Even when we build Flashinfer with AOT mode, there's still
|
||||
# Although we build Flashinfer with AOT mode, there's still
|
||||
# some issues w.r.t. JIT compilation. Therefore we need to
|
||||
# install build dependencies for JIT compilation.
|
||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt \
|
||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
uv pip install --system -r requirements/build.txt
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
@ -298,18 +261,10 @@ ADD . /vllm-workspace/
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
|
||||
# Workaround for #17068
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
if [ "$CUDA_MAJOR" -ge 12 ]; then \
|
||||
uv pip install --system -r requirements/dev.txt; \
|
||||
fi
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/dev.txt
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -328,15 +283,12 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
|
||||
# will not be imported by other tests
|
||||
RUN mkdir test_docs
|
||||
RUN mv docs test_docs/
|
||||
RUN cp -r examples test_docs/
|
||||
RUN mv vllm test_docs/
|
||||
RUN mv mkdocs.yaml test_docs/
|
||||
#################### TEST IMAGE ####################
|
||||
|
||||
#################### OPENAI API SERVER ####################
|
||||
# base openai image with additional requirements, for any subsequent openai-style images
|
||||
FROM vllm-base AS vllm-openai-base
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
69
Dockerfile.cpu
Normal file
69
Dockerfile.cpu
Normal file
@ -0,0 +1,69 @@
|
||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
||||
|
||||
FROM ubuntu:22.04 AS cpu-test-1
|
||||
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
|
||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
|
||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||
# intel-openmp provides additional performance improvement vs. openmp
|
||||
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install intel-openmp==2025.0.1
|
||||
|
||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
|
||||
|
||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
|
||||
RUN pip install intel_extension_for_pytorch==2.6.0
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
|
||||
pip install --upgrade pip && \
|
||||
pip install -r requirements/build.txt
|
||||
|
||||
FROM cpu-test-1 AS build
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
|
||||
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
|
||||
pip install -v -r requirements/cpu.txt
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512
|
||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
||||
pip install dist/*.whl && \
|
||||
rm -rf dist
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install -e tests/vllm_test_utils
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
@ -1,4 +1,4 @@
|
||||
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# default base image
|
||||
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
@ -21,8 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
|
||||
WORKDIR ${APP_MOUNT}/vllm
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
|
||||
RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
||||
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
|
||||
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install pytest
|
||||
|
||||
# uninstall transformers-neuronx package explicitly to avoid version conflict
|
||||
@ -48,8 +49,6 @@ RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
|
||||
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
|
||||
|
||||
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
|
||||
|
||||
# overwrite entrypoint to run bash script
|
||||
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
|
||||
|
||||
@ -21,8 +21,12 @@ ENV UV_LINK_MODE=copy
|
||||
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
|
||||
# when `--jobs=<N>` is passed with podman build command
|
||||
RUN microdnf install -y openssl-devel dnf \
|
||||
&& dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
|
||||
&& dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
|
||||
&& dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
|
||||
https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
|
||||
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
|
||||
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
|
||||
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
|
||||
&& dnf config-manager --set-enabled crb \
|
||||
&& dnf install -y \
|
||||
git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
|
||||
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
|
||||
@ -34,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \
|
||||
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
|
||||
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
|
||||
&& python -m pip install -U pip uv \
|
||||
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
|
||||
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
|
||||
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
|
||||
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
|
||||
&& cd /tmp && touch control
|
||||
@ -122,16 +126,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
FROM base-builder AS cv-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG OPENCV_VERSION=86
|
||||
# patch for version 4.11.0.86
|
||||
ARG OPENCV_PATCH=97f3f39
|
||||
ARG OPENCV_VERSION=84
|
||||
ARG ENABLE_HEADLESS=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
|
||||
cd opencv-python && \
|
||||
sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
|
||||
cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
|
||||
sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
|
||||
python -m build --wheel --installer=uv --outdir /opencvwheels/
|
||||
|
||||
###############################################################
|
||||
@ -147,15 +148,9 @@ COPY --from=arrow-builder /tmp/control /dev/null
|
||||
COPY --from=cv-builder /tmp/control /dev/null
|
||||
|
||||
ARG VLLM_TARGET_DEVICE=cpu
|
||||
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
|
||||
# this step installs vllm and populates uv cache
|
||||
# with all the transitive dependencies
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
|
||||
uv pip install maturin && \
|
||||
uv build --wheel --out-dir /hf_wheels/
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
|
||||
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
|
||||
@ -164,7 +159,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
|
||||
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
|
||||
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
|
||||
uv pip install pandas pythran pybind11 && \
|
||||
# sentencepiece.pc is in some pkgconfig inside uv cache
|
||||
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
|
||||
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
|
||||
@ -243,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
&& python -m pip install -U pip uv --no-cache \
|
||||
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
|
||||
&& make -C /numactl install \
|
||||
&& uv pip install 'cmake<4' \
|
||||
&& uv pip install cmake \
|
||||
&& cmake --install /lapack/build \
|
||||
&& uv pip uninstall cmake
|
||||
|
||||
@ -252,9 +247,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
|
||||
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
|
||||
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
|
||||
--mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
|
||||
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
|
||||
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
|
||||
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
WORKDIR /workspace/vllm
|
||||
@ -114,16 +114,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
|
||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
ENV TOKENIZERS_PARALLELISM=false
|
||||
|
||||
# ENV that can improve safe tensor loading, and end-to-end time
|
||||
ENV SAFETENSORS_FAST_GPU=1
|
||||
|
||||
# User-friendly environment setting for multi-processing to avoid below RuntimeError.
|
||||
# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
|
||||
# you must use the 'spawn' start method
|
||||
# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
|
||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
|
||||
# Performance environment variable.
|
||||
ENV HIP_FORCE_DEV_KERNARG=1
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@ -1,18 +1,18 @@
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
|
||||
ARG HIPBLASLT_BRANCH="db8e93b4"
|
||||
ARG HIPBLASLT_BRANCH="4d40e36"
|
||||
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
|
||||
ARG LEGACY_HIPBLASLT_OPTION=
|
||||
ARG RCCL_BRANCH="648a58d"
|
||||
ARG RCCL_REPO="https://github.com/ROCm/rccl"
|
||||
ARG TRITON_BRANCH="e5be006"
|
||||
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
||||
ARG PYTORCH_BRANCH="295f2ed4"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.21.0"
|
||||
ARG PYTORCH_BRANCH="3a585126"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.19.1"
|
||||
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="1a7f4dfa"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="c1debd8"
|
||||
ARG FA_BRANCH="b7d29fb"
|
||||
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
|
||||
ARG AITER_BRANCH="21d47a9"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
|
||||
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
|
||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
@ -31,11 +31,8 @@ ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
|
||||
&& for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
done \
|
||||
&& apt-get install -y software-properties-common git curl sudo vim less \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
python${PYTHON_VERSION}-lib2to3 python-is-python3 \
|
||||
@ -45,7 +42,7 @@ RUN apt-get update -y \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
|
||||
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
|
||||
|
||||
FROM base AS build_hipblaslt
|
||||
ARG HIPBLASLT_BRANCH
|
||||
@ -63,8 +60,7 @@ RUN cd hipBLAS-common \
|
||||
RUN git clone https://github.com/ROCm/hipBLASLt
|
||||
RUN cd hipBLASLt \
|
||||
&& git checkout ${HIPBLASLT_BRANCH} \
|
||||
&& apt-get install -y llvm-dev \
|
||||
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||
&& cd build/release \
|
||||
&& make package
|
||||
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
||||
@ -114,24 +110,11 @@ RUN git clone ${FA_REPO}
|
||||
RUN cd flash-attention \
|
||||
&& git checkout ${FA_BRANCH} \
|
||||
&& git submodule update --init \
|
||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
|
||||
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
|
||||
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||
&& cp /app/vision/dist/*.whl /app/install \
|
||||
&& cp /app/flash-attention/dist/*.whl /app/install
|
||||
|
||||
FROM base AS build_aiter
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||
dpkg -i /install/*deb \
|
||||
@ -147,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG AITER_REPO
|
||||
ARG AITER_BRANCH
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt \
|
||||
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
|
||||
|
||||
ARG BASE_IMAGE
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
ARG RCCL_BRANCH
|
||||
ARG RCCL_REPO
|
||||
@ -164,8 +154,6 @@ ARG PYTORCH_REPO
|
||||
ARG PYTORCH_VISION_REPO
|
||||
ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
|
||||
@ -179,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||
@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
|
||||
RUN microdnf install -y \
|
||||
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
|
||||
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
|
||||
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \
|
||||
openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
|
||||
microdnf clean all
|
||||
|
||||
# Python Installation
|
||||
@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
cd ../../python && \
|
||||
export PYARROW_PARALLEL=4 && \
|
||||
export ARROW_BUILD_TYPE=release && \
|
||||
uv pip install -r requirements-build.txt && \
|
||||
uv pip install -r requirements/build.txt && \
|
||||
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel
|
||||
|
||||
FROM python-install AS numa-build
|
||||
@ -84,58 +84,18 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
|
||||
rustup default stable && \
|
||||
rustup show
|
||||
|
||||
FROM python-install AS torch
|
||||
ARG TORCH_VERSION=2.7.0
|
||||
ENV export _GLIBCXX_USE_CXX11_ABI=1
|
||||
ENV CARGO_HOME=/root/.cargo
|
||||
ENV RUSTUP_HOME=/root/.rustup
|
||||
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
|
||||
WORKDIR /tmp
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
|
||||
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
||||
git clone https://github.com/pytorch/pytorch.git && \
|
||||
cd pytorch && \
|
||||
git checkout v2.7.0 && \
|
||||
git submodule sync && \
|
||||
git submodule update --init --recursive && \
|
||||
uv pip install cmake ninja && \
|
||||
uv pip install -r requirements.txt && \
|
||||
python setup.py bdist_wheel
|
||||
|
||||
|
||||
FROM python-install AS torch-vision
|
||||
# Install torchvision
|
||||
ARG TORCH_VERSION=2.7.0
|
||||
ARG TORCH_VERSION=2.7.0.dev20250304
|
||||
ARG TORCH_VISION_VERSION=v0.20.1
|
||||
WORKDIR /tmp
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
|
||||
git clone https://github.com/pytorch/vision.git && \
|
||||
cd vision && \
|
||||
git checkout $TORCH_VISION_VERSION && \
|
||||
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
|
||||
uv pip install -v $TORCH_WHL_FILE && \
|
||||
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
|
||||
python setup.py bdist_wheel
|
||||
|
||||
FROM python-install AS hf-xet-builder
|
||||
# Install hf-xet
|
||||
WORKDIR /tmp
|
||||
ENV CARGO_HOME=/root/.cargo
|
||||
ENV RUSTUP_HOME=/root/.rustup
|
||||
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
|
||||
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
||||
git clone https://github.com/huggingface/xet-core.git && \
|
||||
cd xet-core/hf_xet/ && \
|
||||
uv pip install maturin patchelf && \
|
||||
python -m maturin build --release --out dist && \
|
||||
mkdir -p /tmp/hf-xet/dist && \
|
||||
cp dist/*.whl /tmp/hf-xet/dist/
|
||||
|
||||
# Final build stage
|
||||
FROM python-install AS vllm-cpu
|
||||
ARG PYTHON_VERSION
|
||||
@ -147,7 +107,6 @@ ENV UV_LINK_MODE=copy
|
||||
ENV CARGO_HOME=/root/.cargo
|
||||
ENV RUSTUP_HOME=/root/.rustup
|
||||
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
|
||||
COPY . /workspace/vllm
|
||||
WORKDIR /workspace/vllm
|
||||
@ -161,18 +120,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
||||
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
|
||||
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
|
||||
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
|
||||
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
|
||||
sed -i '/^torch/d' requirements/build.txt && \
|
||||
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
|
||||
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
|
||||
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
|
||||
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
|
||||
uv pip install -v \
|
||||
$ARROW_WHL_FILE \
|
||||
$VISION_WHL_FILE \
|
||||
$HF_XET_WHL_FILE \
|
||||
$TORCH_WHL_FILE \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
||||
--index-strategy unsafe-best-match \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu.txt
|
||||
@ -195,5 +149,4 @@ USER 2000
|
||||
WORKDIR /home/vllm
|
||||
|
||||
# Set the default entrypoint
|
||||
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
python3 -m pip install \
|
||||
-r requirements/tpu.txt
|
||||
RUN python3 -m pip install -e .
|
||||
RUN python3 setup.py develop
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
@ -40,6 +40,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
python3 setup.py install
|
||||
|
||||
# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
|
||||
# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install intel-extension-for-pytorch==2.6.10+xpu \
|
||||
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
FROM vllm-base AS vllm-openai
|
||||
36
README.md
36
README.md
@ -1,7 +1,7 @@
|
||||
<p align="center">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
|
||||
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
|
||||
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
|
||||
</picture>
|
||||
</p>
|
||||
|
||||
@ -10,26 +10,29 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
</h3>
|
||||
|
||||
<p align="center">
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
|
||||
|
||||
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
||||
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
||||
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||
|
||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||
|
||||
<details>
|
||||
<summary>Previous News</summary>
|
||||
|
||||
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
|
||||
@ -58,7 +61,7 @@ vLLM is fast with:
|
||||
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||
- Continuous batching of incoming requests
|
||||
- Fast model execution with CUDA/HIP graph
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
@ -74,7 +77,7 @@ vLLM is flexible and easy to use with:
|
||||
- OpenAI-compatible API server
|
||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
|
||||
- Prefix caching support
|
||||
- Multi-LoRA support
|
||||
- Multi-lora support
|
||||
|
||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||
- Transformer-like LLMs (e.g., Llama)
|
||||
@ -100,14 +103,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
||||
## Contributing
|
||||
|
||||
We welcome and value any contributions and collaborations.
|
||||
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
|
||||
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
|
||||
|
||||
## Sponsors
|
||||
|
||||
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
|
||||
|
||||
<!-- Note: Please sort them in alphabetical order. -->
|
||||
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
|
||||
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
|
||||
Cash Donations:
|
||||
- a16z
|
||||
- Dropbox
|
||||
@ -123,7 +126,6 @@ Compute Resources:
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Google Cloud
|
||||
- Intel
|
||||
- Lambda Lab
|
||||
- Nebius
|
||||
- Novita AI
|
||||
|
||||
@ -8,6 +8,4 @@ Please report security issues privately using [the vulnerability submission form
|
||||
|
||||
---
|
||||
|
||||
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
|
||||
|
||||
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
|
||||
|
||||
@ -41,45 +41,29 @@ become available.
|
||||
<td><code>synthetic</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-VisionArena</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>lmarena-ai/VisionArena-Chat</code></td>
|
||||
<td><strong>HuggingFace</strong></td>
|
||||
<td style="text-align: center;">🟡</td>
|
||||
<td style="text-align: center;">🟡</td>
|
||||
<td>Specify your dataset path on HuggingFace</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-InstructCoder</strong></td>
|
||||
<td><strong>VisionArena</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>likaixin/InstructCoder</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-AIMO</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-Other</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Custom</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>Local file: <code>data.jsonl</code></td>
|
||||
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
✅: supported
|
||||
|
||||
🟡: Partial support
|
||||
|
||||
🚧: to be supported
|
||||
|
||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
||||
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
|
||||
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
|
||||
If you need support for other dataset formats, please consider contributing.
|
||||
|
||||
**Note**: VisionArena’s `dataset-name` should be set to `hf`
|
||||
|
||||
---
|
||||
## Example - Online Benchmark
|
||||
@ -87,7 +71,8 @@ become available.
|
||||
First start serving your model
|
||||
|
||||
```bash
|
||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
||||
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
vllm serve ${MODEL_NAME} --disable-log-requests
|
||||
```
|
||||
|
||||
Then run the benchmarking script
|
||||
@ -95,13 +80,12 @@ Then run the benchmarking script
|
||||
```bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--num-prompts 10
|
||||
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
NUM_PROMPTS=10
|
||||
BACKEND="vllm"
|
||||
DATASET_NAME="sharegpt"
|
||||
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
|
||||
```
|
||||
|
||||
If successful, you will see the following output
|
||||
@ -130,38 +114,6 @@ P99 ITL (ms): 8.39
|
||||
==================================================
|
||||
```
|
||||
|
||||
### Custom Dataset
|
||||
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
|
||||
|
||||
```
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
|
||||
```bash
|
||||
# start server
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
```bash
|
||||
# run benchmarking script
|
||||
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name custom \
|
||||
--dataset-path <path-to-your-data-jsonl> \
|
||||
--custom-skip-chat-template \
|
||||
--num-prompts 80 \
|
||||
--max-concurrency 1 \
|
||||
--temperature=0.3 \
|
||||
--top-p=0.75 \
|
||||
--result-dir "./log/"
|
||||
```
|
||||
|
||||
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
```bash
|
||||
@ -170,114 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
```bash
|
||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
|
||||
NUM_PROMPTS=10
|
||||
BACKEND="openai-chat"
|
||||
DATASET_NAME="hf"
|
||||
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
|
||||
DATASET_SPLIT='train'
|
||||
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--hf-split train \
|
||||
--num-prompts 1000
|
||||
--backend "${BACKEND}" \
|
||||
--model "${MODEL_NAME}" \
|
||||
--endpoint "/v1/chat/completions" \
|
||||
--dataset-name "${DATASET_NAME}" \
|
||||
--dataset-path "${DATASET_PATH}" \
|
||||
--hf-split "${DATASET_SPLIT}" \
|
||||
--num-prompts "${NUM_PROMPTS}"
|
||||
```
|
||||
|
||||
### InstructCoder Benchmark with Speculative Decoding
|
||||
### HuggingFaceDataset Examples
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
``` bash
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--dataset-name hf \
|
||||
--dataset-path likaixin/InstructCoder \
|
||||
--num-prompts 2048
|
||||
```
|
||||
|
||||
### Other HuggingFaceDataset Examples
|
||||
Currently, HuggingFaceDataset only supports dataset formats
|
||||
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
|
||||
formats, please consider contributing.
|
||||
|
||||
```bash
|
||||
# need a model with vision capability here
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||
|
||||
```bash
|
||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
|
||||
NUM_PROMPTS=10
|
||||
BACKEND="openai-chat"
|
||||
DATASET_NAME="hf"
|
||||
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
|
||||
DATASET_SPLIT='train'
|
||||
DATASET_SUBSET='chart2text(cauldron)'
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||
--hf-split train \
|
||||
--hf-subset "chart2text(cauldron)" \
|
||||
--num-prompts 10
|
||||
--backend "${BACKEND}" \
|
||||
--model "${MODEL_NAME}" \
|
||||
--endpoint "/v1/chat/completions" \
|
||||
--dataset-name "${DATASET_NAME}" \
|
||||
--dataset-path "${DATASET_PATH}" \
|
||||
--hf-split "${DATASET_SPLIT}" \
|
||||
--num-prompts "${NUM_PROMPTS}" \
|
||||
--hf-subset "${DATASET_SUBSET}"
|
||||
```
|
||||
|
||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||
|
||||
```bash
|
||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
|
||||
NUM_PROMPTS=10
|
||||
BACKEND="openai-chat"
|
||||
DATASET_NAME="hf"
|
||||
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
|
||||
DATASET_SPLIT='train'
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
**`AI-MO/aimo-validation-aime`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--num-prompts 10 \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
**`philschmid/mt-bench`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path philschmid/mt-bench \
|
||||
--num-prompts 80
|
||||
```
|
||||
|
||||
### Running With Sampling Parameters
|
||||
|
||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
parameters can be specified. Example client command:
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--top-k 10 \
|
||||
--top-p 0.9 \
|
||||
--temperature 0.5 \
|
||||
--num-prompts 10
|
||||
--backend "${BACKEND}" \
|
||||
--model "${MODEL_NAME}" \
|
||||
--endpoint "/v1/chat/completions" \
|
||||
--dataset-name "${DATASET_NAME}" \
|
||||
--dataset-path "${DATASET_PATH}" \
|
||||
--hf-split "${DATASET_SPLIT}" \
|
||||
--num-prompts "${NUM_PROMPTS}" \
|
||||
```
|
||||
|
||||
---
|
||||
## Example - Offline Throughput Benchmark
|
||||
|
||||
```bash
|
||||
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
NUM_PROMPTS=10
|
||||
DATASET_NAME="sonnet"
|
||||
DATASET_PATH="vllm/benchmarks/sonnet.txt"
|
||||
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||
--num-prompts 10
|
||||
--model "${MODEL_NAME}" \
|
||||
--dataset-name "${DATASET_NAME}" \
|
||||
--dataset-path "${DATASET_PATH}" \
|
||||
--num-prompts "${NUM_PROMPTS}"
|
||||
```
|
||||
|
||||
If successful, you will see the following output
|
||||
@ -291,13 +217,19 @@ Total num output tokens: 1500
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
|
||||
``` bash
|
||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
|
||||
NUM_PROMPTS=10
|
||||
DATASET_NAME="hf"
|
||||
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
|
||||
DATASET_SPLIT="train"
|
||||
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||
--num-prompts 1000 \
|
||||
--hf-split train
|
||||
--model "${MODEL_NAME}" \
|
||||
--backend "vllm-chat" \
|
||||
--dataset-name "${DATASET_NAME}" \
|
||||
--dataset-path "${DATASET_PATH}" \
|
||||
--num-prompts "${NUM_PROMPTS}" \
|
||||
--hf-split "${DATASET_SPLIT}"
|
||||
```
|
||||
|
||||
The `num prompt tokens` now includes image token counts
|
||||
@ -308,82 +240,29 @@ Total num prompt tokens: 14527
|
||||
Total num output tokens: 1280
|
||||
```
|
||||
|
||||
### InstructCoder Benchmark with Speculative Decoding
|
||||
|
||||
``` bash
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_USE_V1=1 \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--dataset-name=hf \
|
||||
--dataset-path=likaixin/InstructCoder \
|
||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--input-len=1000 \
|
||||
--output-len=100 \
|
||||
--num-prompts=2048 \
|
||||
--async-engine \
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
```
|
||||
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
||||
Total num prompt tokens: 261136
|
||||
Total num output tokens: 204800
|
||||
```
|
||||
|
||||
### Other HuggingFaceDataset Examples
|
||||
|
||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||
--hf-split train \
|
||||
--hf-subset "chart2text(cauldron)" \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--backend vllm-chat \
|
||||
--dataset-name hf \
|
||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
**`AI-MO/aimo-validation-aime`**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_throughput.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--backend vllm \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
### Benchmark with LoRA Adapters
|
||||
|
||||
``` bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
MODEL_NAME="meta-llama/Llama-2-7b-hf"
|
||||
BACKEND="vllm"
|
||||
DATASET_NAME="sharegpt"
|
||||
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||
NUM_PROMPTS=10
|
||||
MAX_LORAS=2
|
||||
MAX_LORA_RANK=8
|
||||
ENABLE_LORA="--enable-lora"
|
||||
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
|
||||
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-hf \
|
||||
--backend vllm \
|
||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--dataset_name sharegpt \
|
||||
--num-prompts 10 \
|
||||
--max-loras 2 \
|
||||
--max-lora-rank 8 \
|
||||
--enable-lora \
|
||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||
--model "${MODEL_NAME}" \
|
||||
--backend "${BACKEND}" \
|
||||
--dataset_path "${DATASET_PATH}" \
|
||||
--dataset_name "${DATASET_NAME}" \
|
||||
--num-prompts "${NUM_PROMPTS}" \
|
||||
--max-loras "${MAX_LORAS}" \
|
||||
--max-lora-rank "${MAX_LORA_RANK}" \
|
||||
${ENABLE_LORA} \
|
||||
--lora-path "${LORA_PATH}"
|
||||
```
|
||||
|
||||
@ -1,212 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
|
||||
# The current server parameter combination is max_num_seqs and max_num_batched_tokens
|
||||
# It also supports additional requirement: e2e latency and prefix cache.
|
||||
|
||||
# Pre-requisite:
|
||||
# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version.
|
||||
# 2. If the model is customized, replace the MODEL's config with the customized config.
|
||||
# 3. Set variables (ALL REQUIRED)
|
||||
# BASE: your directory for vllm repo
|
||||
# MODEL: the model served by vllm
|
||||
# DOWNLOAD_DIR: directory to download and load model weights.
|
||||
# INPUT_LEN: request input len
|
||||
# OUTPUT_LEN: request output len
|
||||
# MIN_CACHE_HIT_PCT: prefix cache rate
|
||||
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
||||
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
|
||||
# 5. The final result will be saved in RESULT file.
|
||||
|
||||
|
||||
# Example use cases
|
||||
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
|
||||
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
|
||||
# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
|
||||
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
|
||||
# 3. If we want to reach 60% prefix cache, what's the best server parameter?
|
||||
# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
|
||||
|
||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||
BASE=""
|
||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||
DOWNLOAD_DIR=""
|
||||
INPUT_LEN=4000
|
||||
OUTPUT_LEN=16
|
||||
MIN_CACHE_HIT_PCT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||
|
||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||
RESULT="$LOG_FOLDER/result.txt"
|
||||
|
||||
echo "result file$ $RESULT"
|
||||
echo "model: $MODEL"
|
||||
echo
|
||||
|
||||
rm -rf $LOG_FOLDER
|
||||
mkdir -p $LOG_FOLDER
|
||||
|
||||
cd "$BASE/vllm"
|
||||
# create sonnet-4x.txt so that we can sample 2048 tokens for input
|
||||
echo "" > benchmarks/sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
|
||||
done
|
||||
|
||||
pip install datasets
|
||||
|
||||
current_hash=$(git rev-parse HEAD)
|
||||
echo "hash:$current_hash" >> "$RESULT"
|
||||
echo "current_hash: $current_hash"
|
||||
|
||||
best_throughput=0
|
||||
best_max_num_seqs=0
|
||||
best_num_batched_tokens=0
|
||||
best_goodput=0
|
||||
run_benchmark() {
|
||||
local max_num_seqs=$1
|
||||
local max_num_batched_tokens=$2
|
||||
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||
echo "vllm_log: $vllm_log"
|
||||
echo
|
||||
rm -f $vllm_log
|
||||
|
||||
# start the server
|
||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
|
||||
--disable-log-requests \
|
||||
--port 8004 \
|
||||
--gpu-memory-utilization 0.98 \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--tensor-parallel-size 1 \
|
||||
--enable-prefix-caching \
|
||||
--load-format dummy \
|
||||
--download-dir $DOWNLOAD_DIR \
|
||||
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
||||
echo "wait for 10 minutes.."
|
||||
echo
|
||||
# wait for 10 minutes...
|
||||
server_started=0
|
||||
for i in {1..60}; do
|
||||
if grep -Fq "Application startup complete" "$vllm_log"; then
|
||||
echo "Application started"
|
||||
server_started=1
|
||||
break
|
||||
else
|
||||
# echo "wait for 10 seconds..."
|
||||
sleep 10
|
||||
fi
|
||||
done
|
||||
|
||||
if (( ! server_started )); then
|
||||
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
|
||||
echo "pkill -f vllm"
|
||||
echo
|
||||
pkill vllm
|
||||
sleep 10
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "run benchmark test..."
|
||||
echo
|
||||
meet_latency_requirement=0
|
||||
# get a basic qps by using request-rate inf
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||
python benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path benchmarks/sonnet_4x.txt \
|
||||
--sonnet-input-len $INPUT_LEN \
|
||||
--sonnet-output-len $OUTPUT_LEN \
|
||||
--ignore-eos \
|
||||
--disable-tqdm \
|
||||
--request-rate inf \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--port 8004 > "$bm_log"
|
||||
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
|
||||
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||
meet_latency_requirement=1
|
||||
fi
|
||||
|
||||
if (( ! meet_latency_requirement )); then
|
||||
# start from request-rate as int(through_put) + 1
|
||||
request_rate=$((${through_put%.*} + 1))
|
||||
while ((request_rate > 0)); do
|
||||
# clear prefix cache
|
||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||
sleep 5
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
|
||||
python benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path benchmarks/sonnet_4x.txt \
|
||||
--sonnet-input-len $INPUT_LEN \
|
||||
--sonnet-output-len $OUTPUT_LEN \
|
||||
--ignore_eos \
|
||||
--disable-tqdm \
|
||||
--request-rate $request_rate \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--port 8004 > "$bm_log"
|
||||
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||
meet_latency_requirement=1
|
||||
break
|
||||
fi
|
||||
request_rate=$((request_rate-1))
|
||||
done
|
||||
fi
|
||||
# write the results and update the best result.
|
||||
if ((meet_latency_requirement)); then
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
|
||||
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
|
||||
best_throughput=$through_put
|
||||
best_max_num_seqs=$max_num_seqs
|
||||
best_num_batched_tokens=$max_num_batched_tokens
|
||||
best_goodput=$goodput
|
||||
fi
|
||||
else
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
|
||||
fi
|
||||
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||
|
||||
echo "pkill -f vllm"
|
||||
echo
|
||||
pkill vllm
|
||||
sleep 10
|
||||
rm -f $vllm_log
|
||||
printf '=%.0s' $(seq 1 20)
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
num_seqs_list="128 256"
|
||||
num_batched_tokens_list="512 1024 2048 4096"
|
||||
for num_seqs in $num_seqs_list; do
|
||||
for num_batched_tokens in $num_batched_tokens_list; do
|
||||
run_benchmark $num_seqs $num_batched_tokens
|
||||
exit 0
|
||||
done
|
||||
done
|
||||
echo "finish permutations"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
@ -12,7 +11,8 @@ from typing import Optional, Union
|
||||
import aiohttp
|
||||
import huggingface_hub.constants
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast)
|
||||
|
||||
# NOTE(simon): do not import vLLM here so the benchmark script
|
||||
# can run without vLLM installed.
|
||||
@ -32,7 +32,6 @@ class RequestFuncInput:
|
||||
extra_body: Optional[dict] = None
|
||||
multi_modal_content: Optional[dict] = None
|
||||
ignore_eos: bool = False
|
||||
language: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -42,7 +41,8 @@ class RequestFuncOutput:
|
||||
latency: float = 0.0
|
||||
output_tokens: int = 0
|
||||
ttft: float = 0.0 # Time to first token
|
||||
itl: list[float] = field(default_factory=list) # list of inter-token latencies
|
||||
itl: list[float] = field(
|
||||
default_factory=list) # list of inter-token latencies
|
||||
tpot: float = 0.0 # avg next-token latencies
|
||||
prompt_len: int = 0
|
||||
error: str = ""
|
||||
@ -55,9 +55,8 @@ async def async_request_tgi(
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||
) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
params = {
|
||||
"max_new_tokens": request_func_input.output_len,
|
||||
"do_sample": True,
|
||||
@ -104,7 +103,8 @@ async def async_request_tgi(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@ -131,9 +131,8 @@ async def async_request_trt_llm(
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||
) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
payload = {
|
||||
"accumulate_tokens": True,
|
||||
"text_input": request_func_input.prompt,
|
||||
@ -158,7 +157,8 @@ async def async_request_trt_llm(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
output.generated_text += data["text_output"]
|
||||
@ -170,7 +170,8 @@ async def async_request_trt_llm(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@ -194,23 +195,15 @@ async def async_request_deepspeed_mii(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("completions", "profile")), (
|
||||
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
)
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||
) as session:
|
||||
payload = {
|
||||
"model": request_func_input.model,
|
||||
"prompt": request_func_input.prompt,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||
"top_p": 1.0,
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
@ -221,22 +214,12 @@ async def async_request_deepspeed_mii(
|
||||
|
||||
st = time.perf_counter()
|
||||
try:
|
||||
async with session.post(
|
||||
url=api_url, json=payload, headers=headers
|
||||
) as response:
|
||||
async with session.post(url=request_func_input.api_url,
|
||||
json=payload) as response:
|
||||
if response.status == 200:
|
||||
parsed_resp = await response.json()
|
||||
output.latency = time.perf_counter() - st
|
||||
if "choices" in parsed_resp:
|
||||
output.generated_text = parsed_resp["choices"][0]["text"]
|
||||
elif "text" in parsed_resp:
|
||||
output.generated_text = parsed_resp["text"][0]
|
||||
else:
|
||||
output.error = (
|
||||
"Unexpected response format: "
|
||||
"neither 'choices' nor 'text' found"
|
||||
)
|
||||
output.success = False
|
||||
output.generated_text = parsed_resp["text"][0]
|
||||
output.success = True
|
||||
else:
|
||||
output.error = response.reason or ""
|
||||
@ -256,20 +239,17 @@ async def async_request_openai_completions(
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("completions", "profile")), (
|
||||
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
)
|
||||
assert api_url.endswith(
|
||||
("completions", "profile")
|
||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||
) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
payload = {
|
||||
"model": request_func_input.model_name
|
||||
if request_func_input.model_name
|
||||
else request_func_input.model,
|
||||
"model": request_func_input.model_name \
|
||||
if request_func_input.model_name else request_func_input.model,
|
||||
"prompt": request_func_input.prompt,
|
||||
"temperature": 0.0,
|
||||
"repetition_penalty": 1.0,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"logprobs": request_func_input.logprobs,
|
||||
"stream": True,
|
||||
@ -281,7 +261,9 @@ async def async_request_openai_completions(
|
||||
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||
if request_func_input.extra_body:
|
||||
payload.update(request_func_input.extra_body)
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||
}
|
||||
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
@ -290,9 +272,8 @@ async def async_request_openai_completions(
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(
|
||||
url=api_url, json=payload, headers=headers
|
||||
) as response:
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
if response.status == 200:
|
||||
first_chunk_received = False
|
||||
async for chunk_bytes in response.content:
|
||||
@ -300,7 +281,8 @@ async def async_request_openai_completions(
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
if chunk != "[DONE]":
|
||||
data = json.loads(chunk)
|
||||
|
||||
@ -320,20 +302,21 @@ async def async_request_openai_completions(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += text or ""
|
||||
if usage := data.get("usage"):
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens")
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
else:
|
||||
output.success = False
|
||||
output.error = (
|
||||
"Never received a valid chunk to calculate TTFT."
|
||||
"This response will be marked as failed!"
|
||||
)
|
||||
"This response will be marked as failed!")
|
||||
output.generated_text = generated_text
|
||||
output.latency = most_recent_timestamp - st
|
||||
else:
|
||||
@ -354,22 +337,23 @@ async def async_request_openai_chat_completions(
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("chat/completions", "profile")), (
|
||||
"OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||
)
|
||||
assert api_url.endswith(
|
||||
("chat/completions", "profile")
|
||||
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||
) as session:
|
||||
async with aiohttp.ClientSession(trust_env=True,
|
||||
timeout=AIOHTTP_TIMEOUT) as session:
|
||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||
if request_func_input.multi_modal_content:
|
||||
content.append(request_func_input.multi_modal_content)
|
||||
payload = {
|
||||
"model": request_func_input.model_name
|
||||
if request_func_input.model_name
|
||||
else request_func_input.model,
|
||||
"model": request_func_input.model_name \
|
||||
if request_func_input.model_name else request_func_input.model,
|
||||
"messages": [
|
||||
{"role": "user", "content": content},
|
||||
{
|
||||
"role": "user",
|
||||
"content": content
|
||||
},
|
||||
],
|
||||
"temperature": 0.0,
|
||||
"max_completion_tokens": request_func_input.output_len,
|
||||
@ -395,16 +379,16 @@ async def async_request_openai_chat_completions(
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(
|
||||
url=api_url, json=payload, headers=headers
|
||||
) as response:
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
if response.status == 200:
|
||||
async for chunk_bytes in response.content:
|
||||
chunk_bytes = chunk_bytes.strip()
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
if chunk != "[DONE]":
|
||||
timestamp = time.perf_counter()
|
||||
data = json.loads(chunk)
|
||||
@ -418,11 +402,13 @@ async def async_request_openai_chat_completions(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
generated_text += content or ""
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens")
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@ -442,115 +428,8 @@ async def async_request_openai_chat_completions(
|
||||
return output
|
||||
|
||||
|
||||
async def async_request_openai_audio(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||
import soundfile
|
||||
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("transcriptions", "translations")), (
|
||||
"OpenAI Chat Completions API URL must end with 'transcriptions' "
|
||||
)
|
||||
"or `translations`."
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||
) as session:
|
||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||
payload = {
|
||||
"model": request_func_input.model_name
|
||||
if request_func_input.model_name
|
||||
else request_func_input.model,
|
||||
"temperature": 0.0,
|
||||
"max_completion_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
"language": "en",
|
||||
# Flattened due to multipart/form-data
|
||||
"stream_include_usage": True,
|
||||
"stream_continuous_usage_stats": True,
|
||||
}
|
||||
if request_func_input.extra_body:
|
||||
payload.update(request_func_input.extra_body)
|
||||
headers = {
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
}
|
||||
|
||||
# Send audio file
|
||||
def to_bytes(y, sr):
|
||||
buffer = io.BytesIO()
|
||||
soundfile.write(buffer, y, sr, format="WAV")
|
||||
buffer.seek(0)
|
||||
return buffer
|
||||
|
||||
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
|
||||
form = aiohttp.FormData()
|
||||
form.add_field("file", f, content_type="audio/wav")
|
||||
for key, value in payload.items():
|
||||
form.add_field(key, str(value))
|
||||
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
generated_text = ""
|
||||
ttft = 0.0
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(
|
||||
url=api_url, data=form, headers=headers
|
||||
) as response:
|
||||
if response.status == 200:
|
||||
async for chunk_bytes in response.content:
|
||||
chunk_bytes = chunk_bytes.strip()
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
if chunk != "[DONE]":
|
||||
timestamp = time.perf_counter()
|
||||
data = json.loads(chunk)
|
||||
|
||||
if choices := data.get("choices"):
|
||||
content = choices[0]["delta"].get("content")
|
||||
# First token
|
||||
if ttft == 0.0:
|
||||
ttft = timestamp - st
|
||||
output.ttft = ttft
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(
|
||||
timestamp - most_recent_timestamp
|
||||
)
|
||||
|
||||
generated_text += content or ""
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens"
|
||||
)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
output.generated_text = generated_text
|
||||
output.success = True
|
||||
output.latency = most_recent_timestamp - st
|
||||
else:
|
||||
output.error = response.reason or ""
|
||||
output.success = False
|
||||
except Exception:
|
||||
output.success = False
|
||||
exc_info = sys.exc_info()
|
||||
output.error = "".join(traceback.format_exception(*exc_info))
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
return output
|
||||
|
||||
|
||||
def get_model(pretrained_model_name_or_path: str) -> str:
|
||||
if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
|
||||
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
||||
from modelscope import snapshot_download
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import get_lock
|
||||
@ -561,8 +440,7 @@ def get_model(pretrained_model_name_or_path: str) -> str:
|
||||
model_path = snapshot_download(
|
||||
model_id=pretrained_model_name_or_path,
|
||||
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
||||
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
|
||||
)
|
||||
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
||||
|
||||
return model_path
|
||||
return pretrained_model_name_or_path
|
||||
@ -575,23 +453,23 @@ def get_tokenizer(
|
||||
**kwargs,
|
||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||
pretrained_model_name_or_path
|
||||
):
|
||||
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
|
||||
pretrained_model_name_or_path):
|
||||
pretrained_model_name_or_path = get_model(
|
||||
pretrained_model_name_or_path)
|
||||
if tokenizer_mode == "slow":
|
||||
if kwargs.get("use_fast", False):
|
||||
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||
raise ValueError(
|
||||
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||
kwargs["use_fast"] = False
|
||||
if tokenizer_mode == "mistral":
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"MistralTokenizer requires vllm package.\n"
|
||||
"Please install it with `pip install vllm` "
|
||||
"to use mistral tokenizer mode."
|
||||
) from e
|
||||
return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
|
||||
raise ImportError("MistralTokenizer requires vllm package.\n"
|
||||
"Please install it with `pip install vllm` "
|
||||
"to use mistral tokenizer mode.") from e
|
||||
return MistralTokenizer.from_pretrained(
|
||||
str(pretrained_model_name_or_path))
|
||||
else:
|
||||
return AutoTokenizer.from_pretrained(
|
||||
pretrained_model_name_or_path,
|
||||
@ -607,15 +485,7 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"deepspeed-mii": async_request_deepspeed_mii,
|
||||
"openai": async_request_openai_completions,
|
||||
"openai-chat": async_request_openai_chat_completions,
|
||||
"openai-audio": async_request_openai_audio,
|
||||
"tensorrt-llm": async_request_trt_llm,
|
||||
"scalellm": async_request_openai_completions,
|
||||
"sglang": async_request_openai_completions,
|
||||
"llama.cpp": async_request_openai_completions,
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
k
|
||||
for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v in (async_request_openai_completions, async_request_openai_chat_completions)
|
||||
]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -6,13 +6,14 @@ import dataclasses
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from tqdm import tqdm
|
||||
|
||||
import vllm.envs as envs
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.inputs import PromptType
|
||||
@ -20,14 +21,13 @@ from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def save_to_pytorch_benchmark_format(
|
||||
args: argparse.Namespace, results: dict[str, Any]
|
||||
) -> None:
|
||||
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
results: dict[str, Any]) -> None:
|
||||
pt_records = convert_to_pytorch_benchmark_format(
|
||||
args=args,
|
||||
metrics={"latency": results["latencies"]},
|
||||
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
|
||||
)
|
||||
extra_info={k: results[k]
|
||||
for k in ["avg_latency", "percentiles"]})
|
||||
if pt_records:
|
||||
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
||||
write_to_json(pt_file, pt_records)
|
||||
@ -42,11 +42,9 @@ def main(args: argparse.Namespace):
|
||||
# the engine will automatically process the request in multiple batches.
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
assert llm.llm_engine.model_config.max_model_len >= (
|
||||
args.input_len + args.output_len
|
||||
), (
|
||||
"Please ensure that max_model_len is greater than"
|
||||
" the sum of input_len and output_len."
|
||||
)
|
||||
args.input_len +
|
||||
args.output_len), ("Please ensure that max_model_len is greater than"
|
||||
" the sum of input_len and output_len.")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
n=args.n,
|
||||
@ -57,16 +55,18 @@ def main(args: argparse.Namespace):
|
||||
detokenize=not args.disable_detokenize,
|
||||
)
|
||||
print(sampling_params)
|
||||
dummy_prompt_token_ids = np.random.randint(
|
||||
10000, size=(args.batch_size, args.input_len)
|
||||
)
|
||||
dummy_prompts: list[PromptType] = [
|
||||
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
|
||||
]
|
||||
dummy_prompt_token_ids = np.random.randint(10000,
|
||||
size=(args.batch_size,
|
||||
args.input_len))
|
||||
dummy_prompts: list[PromptType] = [{
|
||||
"prompt_token_ids": batch
|
||||
} for batch in dummy_prompt_token_ids.tolist()]
|
||||
|
||||
def llm_generate():
|
||||
if not args.use_beam_search:
|
||||
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
|
||||
llm.generate(dummy_prompts,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=False)
|
||||
else:
|
||||
llm.beam_search(
|
||||
dummy_prompts,
|
||||
@ -79,9 +79,16 @@ def main(args: argparse.Namespace):
|
||||
|
||||
def run_to_completion(profile_dir: Optional[str] = None):
|
||||
if profile_dir:
|
||||
llm.start_profile()
|
||||
llm_generate()
|
||||
llm.stop_profile()
|
||||
with torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.CUDA,
|
||||
],
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
str(profile_dir)),
|
||||
) as p:
|
||||
llm_generate()
|
||||
print(p.key_averages().table(sort_by="self_cuda_time_total"))
|
||||
else:
|
||||
start_time = time.perf_counter()
|
||||
llm_generate()
|
||||
@ -94,7 +101,10 @@ def main(args: argparse.Namespace):
|
||||
run_to_completion(profile_dir=None)
|
||||
|
||||
if args.profile:
|
||||
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
||||
profile_dir = args.profile_result_dir
|
||||
if not profile_dir:
|
||||
profile_dir = (Path(".") / "vllm_benchmark_result" /
|
||||
f"latency_result_{time.time()}")
|
||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||
run_to_completion(profile_dir=profile_dir)
|
||||
return
|
||||
@ -125,8 +135,7 @@ def main(args: argparse.Namespace):
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the latency of processing a single batch of "
|
||||
"requests till completion."
|
||||
)
|
||||
"requests till completion.")
|
||||
parser.add_argument("--input-len", type=int, default=32)
|
||||
parser.add_argument("--output-len", type=int, default=128)
|
||||
parser.add_argument("--batch-size", type=int, default=8)
|
||||
@ -143,14 +152,22 @@ if __name__ == "__main__":
|
||||
default=10,
|
||||
help="Number of iterations to run for warmup.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-iters", type=int, default=30, help="Number of iterations to run."
|
||||
)
|
||||
parser.add_argument("--num-iters",
|
||||
type=int,
|
||||
default=30,
|
||||
help="Number of iterations to run.")
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="profile the generation process of a single batch",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile-result-dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help=("path to save the pytorch profiler output. Can be visualized "
|
||||
"with ui.perfetto.dev or Tensorboard."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json",
|
||||
type=str,
|
||||
@ -160,20 +177,10 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--disable-detokenize",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Do not detokenize responses (i.e. do not include "
|
||||
"detokenization time in the latency measurement)"
|
||||
),
|
||||
help=("Do not detokenize responses (i.e. do not include "
|
||||
"detokenization time in the latency measurement)"),
|
||||
)
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# V1 enables prefix caching by default which skews the latency
|
||||
# numbers. We need to disable prefix caching by default.
|
||||
parser.set_defaults(enable_prefix_caching=False)
|
||||
args = parser.parse_args()
|
||||
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
|
||||
raise OSError(
|
||||
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
|
||||
"Please set it to a valid path to use torch profiler."
|
||||
)
|
||||
main(args)
|
||||
|
||||
@ -76,7 +76,7 @@ def repeat_prompts(prompts, repeat_count, mode: str):
|
||||
- 'random': Shuffle the prompts randomly after repetition.
|
||||
- 'tile': Repeat the entire prompt list in sequence.
|
||||
Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
|
||||
- 'interleave': Repeat each prompt consecutively before moving to
|
||||
- 'interleave': Repeat each prompt consecutively before moving to
|
||||
the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
|
||||
|
||||
Returns:
|
||||
@ -86,21 +86,20 @@ def repeat_prompts(prompts, repeat_count, mode: str):
|
||||
ValueError: If an invalid mode is provided.
|
||||
"""
|
||||
print("Repeat mode: ", mode)
|
||||
if mode == "random":
|
||||
if mode == 'random':
|
||||
repeated_prompts = prompts * repeat_count
|
||||
random.shuffle(repeated_prompts)
|
||||
return repeated_prompts
|
||||
elif mode == "tile":
|
||||
elif mode == 'tile':
|
||||
return prompts * repeat_count
|
||||
elif mode == "interleave":
|
||||
elif mode == 'interleave':
|
||||
repeated_prompts = []
|
||||
for prompt in prompts:
|
||||
repeated_prompts.extend([prompt] * repeat_count)
|
||||
return repeated_prompts
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
|
||||
)
|
||||
raise ValueError(f"Invalid mode: {mode}, only support "
|
||||
"'random', 'tile', 'interleave'")
|
||||
|
||||
|
||||
def main(args):
|
||||
@ -110,16 +109,16 @@ def main(args):
|
||||
# we append the document id at the beginning to avoid any of the document
|
||||
# being the prefix of other documents
|
||||
prompts = [
|
||||
str(i) + " ".join(["hi"] * args.document_length)
|
||||
str(i) + ' '.join(['hi'] * args.document_length)
|
||||
for i in range(args.num_documents)
|
||||
]
|
||||
|
||||
prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
|
||||
|
||||
warmup_prompts = [
|
||||
"This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
|
||||
for i in range(args.num_documents)
|
||||
]
|
||||
"This is warm up request " + str(i) + \
|
||||
' '.join(['hi'] * args.document_length)
|
||||
for i in range(args.num_documents)]
|
||||
|
||||
# Create the LLM engine
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
@ -143,52 +142,42 @@ def main(args):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance with or "
|
||||
"without automatic prefix caching."
|
||||
)
|
||||
description=
|
||||
'Benchmark the performance with or without automatic prefix caching.')
|
||||
|
||||
parser.add_argument(
|
||||
"--document-length",
|
||||
'--document-length',
|
||||
type=int,
|
||||
# Roughly the number of tokens for a system paper,
|
||||
# excluding images
|
||||
default=20000,
|
||||
help="Range of input lengths for sampling prompts, "
|
||||
'specified as "min:max" (e.g., "128:256").',
|
||||
)
|
||||
help='Range of input lengths for sampling prompts,'
|
||||
'specified as "min:max" (e.g., "128:256").')
|
||||
|
||||
parser.add_argument(
|
||||
"--num-documents",
|
||||
type=int,
|
||||
default=8,
|
||||
help="Range of input lengths for sampling prompts, "
|
||||
'specified as "min:max" (e.g., "128:256").',
|
||||
)
|
||||
parser.add_argument('--num-documents',
|
||||
type=int,
|
||||
default=8,
|
||||
help='Range of input lengths for sampling prompts,'
|
||||
'specified as "min:max" (e.g., "128:256").')
|
||||
|
||||
parser.add_argument("--output-len", type=int, default=10)
|
||||
parser.add_argument('--output-len', type=int, default=10)
|
||||
|
||||
parser.add_argument(
|
||||
"--repeat-count",
|
||||
type=int,
|
||||
default=2,
|
||||
help="Number of times to repeat each prompt",
|
||||
)
|
||||
parser.add_argument('--repeat-count',
|
||||
type=int,
|
||||
default=2,
|
||||
help='Number of times to repeat each prompt')
|
||||
|
||||
parser.add_argument(
|
||||
"--repeat-mode",
|
||||
type=str,
|
||||
default="random",
|
||||
help="The mode to repeat prompts. The supported "
|
||||
'modes are "random", "tile", and "interleave". '
|
||||
"See repeat_prompts() in the source code for details.",
|
||||
)
|
||||
parser.add_argument("--repeat-mode",
|
||||
type=str,
|
||||
default='random',
|
||||
help='The mode to repeat prompts. The supported '
|
||||
'modes are "random", "tile", and "interleave". '
|
||||
'See repeat_prompts() in the source code for details.')
|
||||
|
||||
parser.add_argument(
|
||||
"--shuffle-seed",
|
||||
type=int,
|
||||
default=0,
|
||||
help='Random seed when the repeat mode is "random"',
|
||||
)
|
||||
parser.add_argument("--shuffle-seed",
|
||||
type=int,
|
||||
default=0,
|
||||
help='Random seed when the repeat mode is "random"')
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -63,15 +63,14 @@ class Request:
|
||||
output_len: int
|
||||
|
||||
|
||||
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
|
||||
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
|
||||
vocab = tokenizer.get_vocab()
|
||||
all_special_ids = set(tokenizer.all_special_ids)
|
||||
|
||||
# Remove the special tokens.
|
||||
return random.choices(
|
||||
[v for k, v in vocab.items() if k not in all_special_ids],
|
||||
k=length,
|
||||
)
|
||||
vocab = {
|
||||
k: v
|
||||
for k, v in vocab.items() if k not in tokenizer.all_special_ids
|
||||
}
|
||||
return random.choices(list(vocab.values()), k=length)
|
||||
|
||||
|
||||
def sample_requests_from_dataset(
|
||||
@ -90,10 +89,8 @@ def sample_requests_from_dataset(
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
# Only keep the first two turns of each conversation.
|
||||
dataset = [
|
||||
(data["conversations"][0]["value"], data["conversations"][1]["value"])
|
||||
for data in dataset
|
||||
]
|
||||
dataset = [(data["conversations"][0]["value"],
|
||||
data["conversations"][1]["value"]) for data in dataset]
|
||||
|
||||
# Shuffle the dataset.
|
||||
random.shuffle(dataset)
|
||||
@ -114,9 +111,8 @@ def sample_requests_from_dataset(
|
||||
completion = dataset[i][1]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = (
|
||||
len(completion_token_ids) if fixed_output_len is None else fixed_output_len
|
||||
)
|
||||
output_len = (len(completion_token_ids)
|
||||
if fixed_output_len is None else fixed_output_len)
|
||||
if min_len <= prompt_len <= max_len:
|
||||
filtered_requests.append(Request(prompt, prompt_len, output_len))
|
||||
|
||||
@ -130,27 +126,27 @@ def sample_requests_from_random(
|
||||
fixed_output_len: Optional[int],
|
||||
prefix_len: int,
|
||||
) -> list[Request]:
|
||||
|
||||
requests = []
|
||||
prefix_token_ids = sample_tokens(tokenizer, prefix_len)
|
||||
min_len, max_len = input_length_range
|
||||
|
||||
for i in range(num_requests):
|
||||
unique_part_token_ids = sample_tokens(
|
||||
tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
|
||||
)
|
||||
tokenizer,
|
||||
random.randint(min_len - prefix_len, max_len - prefix_len))
|
||||
prompt_token_ids = prefix_token_ids + unique_part_token_ids
|
||||
prompt = tokenizer.decode(prompt_token_ids)
|
||||
prompt_len = len(prompt_token_ids)
|
||||
assert min_len <= prompt_len <= max_len, (
|
||||
f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
|
||||
)
|
||||
assert (min_len <= prompt_len <= max_len
|
||||
), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
|
||||
requests.append(Request(prompt, prompt_len, fixed_output_len))
|
||||
return requests
|
||||
|
||||
|
||||
def repeat_and_sort_requests(
|
||||
requests: list[Request], repeat_count: int, sort: bool = False
|
||||
) -> list[str]:
|
||||
def repeat_and_sort_requests(requests: list[Request],
|
||||
repeat_count: int,
|
||||
sort: bool = False) -> list[str]:
|
||||
repeated_requests = requests * repeat_count
|
||||
if sort:
|
||||
repeated_requests.sort(key=lambda x: x[1])
|
||||
@ -161,14 +157,14 @@ def repeat_and_sort_requests(
|
||||
|
||||
def main(args):
|
||||
tokenizer = get_tokenizer(args.model, trust_remote_code=True)
|
||||
input_length_range = tuple(map(int, args.input_length_range.split(":")))
|
||||
input_length_range = tuple(map(int, args.input_length_range.split(':')))
|
||||
random.seed(args.seed)
|
||||
if args.dataset_path is not None:
|
||||
if args.prefix_len > 0:
|
||||
raise ValueError(
|
||||
"prefix-len is not supported when dataset-path is provided."
|
||||
)
|
||||
print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
|
||||
raise ValueError("prefix-len is not supported when "
|
||||
"dataset-path is provided.")
|
||||
print(f"Start to sample {args.num_prompts} prompts "
|
||||
f"from {args.dataset_path}")
|
||||
filtered_requests = sample_requests_from_dataset(
|
||||
dataset_path=args.dataset_path,
|
||||
num_requests=args.num_prompts,
|
||||
@ -198,16 +194,14 @@ def main(args):
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=args.output_len,
|
||||
detokenize=not args.disable_detokenize,
|
||||
)
|
||||
sampling_params = SamplingParams(temperature=0,
|
||||
max_tokens=args.output_len,
|
||||
detokenize=not args.disable_detokenize)
|
||||
|
||||
print("Testing filtered requests")
|
||||
prompts = repeat_and_sort_requests(
|
||||
filtered_requests, repeat_count=args.repeat_count, sort=args.sort
|
||||
)
|
||||
prompts = repeat_and_sort_requests(filtered_requests,
|
||||
repeat_count=args.repeat_count,
|
||||
sort=args.sort)
|
||||
|
||||
print("------start generating------")
|
||||
test_prefix(
|
||||
@ -219,35 +213,29 @@ def main(args):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance with or without "
|
||||
"automatic prefix caching."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-path", type=str, default=None, help="Path to the dataset."
|
||||
)
|
||||
parser.add_argument("--output-len", type=int, default=10)
|
||||
parser.add_argument(
|
||||
"--num-prompts",
|
||||
type=int,
|
||||
required=True,
|
||||
help="Number of the prompts sampled from dataset",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--repeat-count",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of times to repeat each prompt",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sort", action="store_true", help="Sort prompts by input length"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-length-range",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Range of input lengths for sampling prompts,"
|
||||
'specified as "min:max" (e.g., "128:256").',
|
||||
)
|
||||
description=
|
||||
'Benchmark the performance with or without automatic prefix caching.')
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument('--output-len', type=int, default=10)
|
||||
parser.add_argument('--num-prompts',
|
||||
type=int,
|
||||
required=True,
|
||||
help="Number of the prompts sampled from dataset")
|
||||
parser.add_argument('--repeat-count',
|
||||
type=int,
|
||||
default=1,
|
||||
help='Number of times to repeat each prompt')
|
||||
parser.add_argument('--sort',
|
||||
action='store_true',
|
||||
help='Sort prompts by input length')
|
||||
parser.add_argument('--input-length-range',
|
||||
type=str,
|
||||
required=True,
|
||||
help='Range of input lengths for sampling prompts,'
|
||||
'specified as "min:max" (e.g., "128:256").')
|
||||
parser.add_argument(
|
||||
"--prefix-len",
|
||||
type=int,
|
||||
@ -258,12 +246,10 @@ if __name__ == "__main__":
|
||||
"when dataset-path is not provided.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-detokenize",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Do not detokenize responses (i.e. do not include "
|
||||
"detokenization time in the latency measurement)"
|
||||
),
|
||||
'--disable-detokenize',
|
||||
action='store_true',
|
||||
help=("Do not detokenize responses (i.e. do not include "
|
||||
"detokenization time in the latency measurement)"),
|
||||
)
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Benchmark offline prioritization."""
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
@ -14,7 +13,7 @@ from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
# Select a equi-probable random priority
|
||||
#Select a equi-probable random priority
|
||||
def get_random_flag():
|
||||
return 0 if random.random() < 0.5 else 1
|
||||
|
||||
@ -34,10 +33,8 @@ def sample_requests(
|
||||
# Filter out the conversations with less than 2 turns.
|
||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||
# Only keep the first two turns of each conversation.
|
||||
dataset = [
|
||||
(data["conversations"][0]["value"], data["conversations"][1]["value"])
|
||||
for data in dataset
|
||||
]
|
||||
dataset = [(data["conversations"][0]["value"],
|
||||
data["conversations"][1]["value"]) for data in dataset]
|
||||
|
||||
# Shuffle the dataset.
|
||||
random.shuffle(dataset)
|
||||
@ -54,9 +51,8 @@ def sample_requests(
|
||||
completion = dataset[i][1]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = (
|
||||
len(completion_token_ids) if fixed_output_len is None else fixed_output_len
|
||||
)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if prompt_len < 4 or output_len < 4:
|
||||
# Prune too short sequences.
|
||||
continue
|
||||
@ -78,16 +74,13 @@ def run_vllm(
|
||||
disable_detokenize: bool = False,
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
|
||||
for request in requests
|
||||
), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" input_len and output_len for all requests."
|
||||
)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" input_len and output_len for all requests.")
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts = []
|
||||
@ -104,8 +97,7 @@ def run_vllm(
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
detokenize=not disable_detokenize,
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
|
||||
@ -119,33 +111,26 @@ def main(args: argparse.Namespace):
|
||||
|
||||
# Sample the requests.
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
if args.dataset is None:
|
||||
# Synthesize a prompt with the given input length.
|
||||
prompt = "hi" * (args.input_len - 1)
|
||||
requests = [
|
||||
(prompt, args.input_len, args.output_len, get_random_flag())
|
||||
for _ in range(args.num_prompts)
|
||||
]
|
||||
requests = [(prompt, args.input_len, args.output_len,
|
||||
get_random_flag()) for _ in range(args.num_prompts)]
|
||||
else:
|
||||
requests = sample_requests(
|
||||
args.dataset, args.num_prompts, tokenizer, args.output_len
|
||||
)
|
||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
||||
args.output_len)
|
||||
|
||||
if args.backend == "vllm":
|
||||
elapsed_time = run_vllm(
|
||||
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
|
||||
)
|
||||
elapsed_time = run_vllm(requests, args.n,
|
||||
EngineArgs.from_cli_args(args),
|
||||
args.disable_detokenize)
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {args.backend}")
|
||||
total_num_tokens = sum(
|
||||
prompt_len + output_len for _, prompt_len, output_len, priority in requests
|
||||
)
|
||||
print(
|
||||
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s"
|
||||
)
|
||||
total_num_tokens = sum(prompt_len + output_len
|
||||
for _, prompt_len, output_len, priority in requests)
|
||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
||||
|
||||
# Output JSON results if specified
|
||||
if args.output_json:
|
||||
@ -162,44 +147,41 @@ def main(args: argparse.Namespace):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||
parser.add_argument("--backend",
|
||||
type=str,
|
||||
choices=["vllm", "hf", "mii"],
|
||||
default="vllm")
|
||||
parser.add_argument("--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument("--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Input prompt length for each request")
|
||||
parser.add_argument("--output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--n",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--num-prompts",
|
||||
type=int,
|
||||
default=200,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument(
|
||||
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset", type=str, default=None, help="Path to the dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Input prompt length for each request",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--n", type=int, default=1, help="Number of generated sequences per prompt."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-prompts", type=int, default=200, help="Number of prompts to process."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json",
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to save the throughput results in JSON format.",
|
||||
)
|
||||
help='Path to save the throughput results in JSON format.')
|
||||
parser.add_argument(
|
||||
"--disable-detokenize",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Do not detokenize responses (i.e. do not include "
|
||||
"detokenization time in the latency measurement)"
|
||||
),
|
||||
'--disable-detokenize',
|
||||
action='store_true',
|
||||
help=("Do not detokenize responses (i.e. do not include "
|
||||
"detokenization time in the latency measurement)"),
|
||||
)
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -5,13 +5,16 @@ On the server side, run one of the following commands:
|
||||
(vLLM OpenAI API server)
|
||||
vllm serve <your_model> --disable-log-requests
|
||||
|
||||
(TGI backend)
|
||||
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
|
||||
|
||||
On the client side, run:
|
||||
python benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend <backend> \
|
||||
--model <your_model> \
|
||||
--dataset json \
|
||||
--structured-output-ratio 1.0 \
|
||||
--structured-output-backend auto \
|
||||
--structured-output-backend xgrammar \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
|
||||
@ -19,7 +22,6 @@ On the client side, run:
|
||||
--endpoint /generate_stream
|
||||
to the end of the command above.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import copy
|
||||
@ -37,15 +39,11 @@ from typing import Optional
|
||||
import datasets
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from backend_request_func import (
|
||||
ASYNC_REQUEST_FUNCS,
|
||||
RequestFuncInput,
|
||||
RequestFuncOutput,
|
||||
)
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
except ImportError:
|
||||
@ -56,9 +54,8 @@ try:
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
from vllm.v1.structured_output.backend_xgrammar import (
|
||||
has_xgrammar_unsupported_json_features,
|
||||
)
|
||||
from vllm.v1.structured_output.utils import (
|
||||
has_xgrammar_unsupported_json_features)
|
||||
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||
|
||||
@ -104,7 +101,6 @@ class SampleRequest:
|
||||
prompt_len: The length of the prompt in tokens.
|
||||
expected_output_len: The expected length of the output in tokens.
|
||||
"""
|
||||
|
||||
prompt: str
|
||||
prompt_len: int
|
||||
expected_output_len: int
|
||||
@ -113,61 +109,60 @@ class SampleRequest:
|
||||
completion: str = None
|
||||
|
||||
|
||||
def sample_requests(
|
||||
tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
|
||||
) -> list[SampleRequest]:
|
||||
if args.dataset == "json" or args.dataset == "json-unique":
|
||||
def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
||||
args: argparse.Namespace) -> list[SampleRequest]:
|
||||
if args.dataset == 'json' or args.dataset == 'json-unique':
|
||||
if args.json_schema_path is None:
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
args.json_schema_path = os.path.join(
|
||||
dir_path, "structured_schemas", "structured_schema_1.json"
|
||||
)
|
||||
args.json_schema_path = os.path.join(dir_path,
|
||||
"structured_schemas",
|
||||
"structured_schema_1.json")
|
||||
json_schemas = []
|
||||
with open(args.json_schema_path) as f:
|
||||
schema = json.load(f)
|
||||
|
||||
if args.dataset == "json-unique":
|
||||
json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)]
|
||||
if args.dataset == 'json-unique':
|
||||
json_schemas = [
|
||||
copy.deepcopy(schema) for _ in range(args.num_prompts)
|
||||
]
|
||||
for i in range(len(json_schemas)):
|
||||
if "properties" not in json_schemas[i]:
|
||||
json_schemas[i]["properties"] = {}
|
||||
json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = {
|
||||
"type": "string",
|
||||
"description": "An unique optional field to avoid cached schemas",
|
||||
}
|
||||
else:
|
||||
json_schemas = [schema] * args.num_prompts
|
||||
json_schemas[i]["properties"][
|
||||
f"__optional_field_{uuid.uuid4()}"] = {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"An unique optional field to avoid cached schemas"
|
||||
}
|
||||
|
||||
def gen_prompt(index: int):
|
||||
return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
|
||||
schema = json_schemas[index % len(json_schemas)]
|
||||
return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
|
||||
|
||||
def get_schema(index: int):
|
||||
return json_schemas[index % len(json_schemas)]
|
||||
|
||||
requests = [
|
||||
SampleRequest(
|
||||
prompt=gen_prompt(i),
|
||||
prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
|
||||
expected_output_len=args.output_len,
|
||||
schema=get_schema(i),
|
||||
structure_type=args.structure_type,
|
||||
)
|
||||
SampleRequest(prompt=gen_prompt(i),
|
||||
prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
|
||||
expected_output_len=args.output_len,
|
||||
schema=get_schema(i),
|
||||
structure_type=args.structure_type)
|
||||
for i in range(args.num_prompts)
|
||||
]
|
||||
|
||||
elif args.dataset == "grammar":
|
||||
schema = """
|
||||
root ::= select_statement
|
||||
?start: select_statement
|
||||
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
?select_statement: "SELECT " column_list " FROM " table_name
|
||||
|
||||
column ::= "col_1 " | "col_2 "
|
||||
?column_list: column_name ("," column_name)*
|
||||
|
||||
table ::= "table_1 " | "table_2 "
|
||||
?table_name: identifier
|
||||
|
||||
condition ::= column "= " number
|
||||
?column_name: identifier
|
||||
|
||||
number ::= "1 " | "2 "
|
||||
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
|
||||
"""
|
||||
prompt = "Generate an SQL query to show the 'username' \
|
||||
and 'email' from the 'users' table."
|
||||
@ -175,13 +170,11 @@ def sample_requests(
|
||||
input_len = len(tokenizer(prompt).input_ids)
|
||||
print(f"Input length of the prompt: {input_len} tokens")
|
||||
requests = [
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=schema,
|
||||
structure_type=args.structure_type,
|
||||
)
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=schema,
|
||||
structure_type=args.structure_type)
|
||||
for _ in range(args.num_prompts)
|
||||
]
|
||||
|
||||
@ -195,13 +188,11 @@ def sample_requests(
|
||||
input_len = len(tokenizer(prompt).input_ids)
|
||||
print(f"Input length of the prompt: {input_len} tokens")
|
||||
requests = [
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=regex,
|
||||
structure_type=args.structure_type,
|
||||
)
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=regex,
|
||||
structure_type=args.structure_type)
|
||||
for _ in range(args.num_prompts)
|
||||
]
|
||||
|
||||
@ -212,55 +203,47 @@ def sample_requests(
|
||||
input_len = len(tokenizer(prompt).input_ids)
|
||||
print(f"Input length of the prompt: {input_len} tokens")
|
||||
requests = [
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=choice,
|
||||
structure_type=args.structure_type,
|
||||
)
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=choice,
|
||||
structure_type=args.structure_type)
|
||||
for _ in range(args.num_prompts)
|
||||
]
|
||||
|
||||
elif args.dataset == "xgrammar_bench":
|
||||
requests: list[SampleRequest] = []
|
||||
dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train")
|
||||
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
|
||||
split="train")
|
||||
full_dataset_len = len(dataset)
|
||||
|
||||
def _filter_func(item):
|
||||
import json
|
||||
|
||||
schema = json.loads(item["schema"])
|
||||
return not has_xgrammar_unsupported_json_features(schema)
|
||||
|
||||
dataset = dataset.filter(_filter_func)
|
||||
num_filtered_out = full_dataset_len - len(dataset)
|
||||
print(
|
||||
f"dataset has {len(dataset)} entries after filtering "
|
||||
f"out {num_filtered_out} entries with unsupported features"
|
||||
)
|
||||
print(f"dataset has {len(dataset)} entries after filtering "
|
||||
f"out {num_filtered_out} entries with unsupported features")
|
||||
len_dataset = len(dataset)
|
||||
for data_point_idx in range(args.num_prompts):
|
||||
idx = data_point_idx
|
||||
while idx >= len_dataset:
|
||||
idx -= len_dataset
|
||||
schema = dataset["schema"][idx]
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
dataset["prompt"][idx], tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
|
||||
tokenize=False)
|
||||
input_len = len(tokenizer(prompt).input_ids)
|
||||
completion = dataset["completion"][idx]
|
||||
|
||||
requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=schema,
|
||||
structure_type=args.structure_type,
|
||||
completion=completion,
|
||||
)
|
||||
)
|
||||
SampleRequest(prompt=prompt,
|
||||
prompt_len=input_len,
|
||||
expected_output_len=args.output_len,
|
||||
schema=schema,
|
||||
structure_type=args.structure_type,
|
||||
completion=completion))
|
||||
|
||||
return requests
|
||||
|
||||
@ -292,8 +275,7 @@ async def get_request(
|
||||
|
||||
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||
assert burstiness > 0, (
|
||||
f"A positive burstiness factor is expected, but given {burstiness}."
|
||||
)
|
||||
f"A positive burstiness factor is expected, but given {burstiness}.")
|
||||
theta = 1.0 / (request_rate * burstiness)
|
||||
|
||||
for i, request in enumerate(input_requests):
|
||||
@ -335,8 +317,8 @@ def calculate_metrics(
|
||||
# multiple output tokens may be bundled together
|
||||
# Note : this may inflate the output token count slightly
|
||||
output_len = len(
|
||||
tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
|
||||
)
|
||||
tokenizer(outputs[i].generated_text,
|
||||
add_special_tokens=False).input_ids)
|
||||
actual_output_lens.append(output_len)
|
||||
total_input += input_requests[i].prompt_len
|
||||
tpot = 0
|
||||
@ -360,19 +342,16 @@ def calculate_metrics(
|
||||
|
||||
if "ttft" in goodput_config_dict:
|
||||
valid_metrics.append(ttfts)
|
||||
slo_values.append(
|
||||
goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||
)
|
||||
slo_values.append(goodput_config_dict["ttft"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "tpot" in goodput_config_dict:
|
||||
valid_metrics.append(all_tpots)
|
||||
slo_values.append(
|
||||
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||
)
|
||||
slo_values.append(goodput_config_dict["tpot"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
if "e2el" in goodput_config_dict:
|
||||
valid_metrics.append(e2els)
|
||||
slo_values.append(
|
||||
goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||
)
|
||||
slo_values.append(goodput_config_dict["e2el"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
|
||||
for req_metric in zip(*valid_metrics):
|
||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||
@ -383,8 +362,7 @@ def calculate_metrics(
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
"on the benchmark arguments.",
|
||||
stacklevel=2,
|
||||
)
|
||||
stacklevel=2)
|
||||
metrics = BenchmarkMetrics(
|
||||
completed=completed,
|
||||
total_input=total_input,
|
||||
@ -393,31 +371,27 @@ def calculate_metrics(
|
||||
request_goodput=good_completed / dur_s,
|
||||
output_throughput=sum(actual_output_lens) / dur_s,
|
||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||
mean_ttft_ms=np.mean(ttfts or 0)
|
||||
* 1000, # ttfts is empty if streaming is not supported by backend
|
||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||
1000, # ttfts is empty if streaming is not supported by backend
|
||||
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||
percentiles_ttft_ms=[
|
||||
(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||
std_tpot_ms=np.std(tpots or 0) * 1000,
|
||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||
percentiles_tpot_ms=[
|
||||
(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||
std_itl_ms=np.std(itls or 0) * 1000,
|
||||
median_itl_ms=np.median(itls or 0) * 1000,
|
||||
percentiles_itl_ms=[
|
||||
(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
percentiles_e2el_ms=[
|
||||
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
)
|
||||
|
||||
return metrics, actual_output_lens
|
||||
@ -439,6 +413,7 @@ async def benchmark(
|
||||
ignore_eos: bool,
|
||||
max_concurrency: Optional[int],
|
||||
structured_output_ratio: float,
|
||||
structured_output_backend: str,
|
||||
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
@ -450,17 +425,18 @@ async def benchmark(
|
||||
extra_body = {}
|
||||
# Add the schema to the extra_body
|
||||
extra_body[request.structure_type] = request.schema
|
||||
# Add the specific structured_output_backend
|
||||
extra_body["guided_decoding_backend"] = structured_output_backend
|
||||
return extra_body
|
||||
|
||||
print("Starting initial single prompt test run...")
|
||||
structured_output_req_idx = random.sample(
|
||||
range(len(input_requests)), int(len(input_requests) * structured_output_ratio)
|
||||
)
|
||||
range(len(input_requests)),
|
||||
int(len(input_requests) * structured_output_ratio))
|
||||
|
||||
test_request = input_requests[0]
|
||||
test_req_extra_body = (
|
||||
prepare_extra_body(test_request) if 0 in structured_output_req_idx else None
|
||||
)
|
||||
test_req_extra_body = (prepare_extra_body(test_request)
|
||||
if 0 in structured_output_req_idx else None)
|
||||
test_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=test_request.prompt,
|
||||
@ -474,8 +450,7 @@ async def benchmark(
|
||||
if not test_output.success:
|
||||
raise ValueError(
|
||||
"Initial test run failed - Please make sure benchmark arguments "
|
||||
f"are correctly specified. Error: {test_output.error}"
|
||||
)
|
||||
f"are correctly specified. Error: {test_output.error}")
|
||||
else:
|
||||
print("Initial test run completed. Starting main benchmark run...")
|
||||
|
||||
@ -494,7 +469,10 @@ async def benchmark(
|
||||
if profile_output.success:
|
||||
print("Profiler started")
|
||||
|
||||
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
|
||||
if burstiness == 1.0:
|
||||
distribution = "Poisson process"
|
||||
else:
|
||||
distribution = "Gamma distribution"
|
||||
|
||||
print(f"Traffic request rate: {request_rate}")
|
||||
print(f"Burstiness factor: {burstiness} ({distribution})")
|
||||
@ -506,21 +484,24 @@ async def benchmark(
|
||||
# and it will simplify the code in limited_request_func.
|
||||
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
# if max_concurrency else contextlib.nullcontext())
|
||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||
semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
if max_concurrency else None)
|
||||
|
||||
async def limited_request_func(request_func_input, pbar):
|
||||
if semaphore is None:
|
||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
async with semaphore:
|
||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)
|
||||
|
||||
benchmark_start_time = time.perf_counter()
|
||||
tasks: list[asyncio.Task] = []
|
||||
expected: list[str] = []
|
||||
async for i, request in get_request(input_requests, request_rate, burstiness):
|
||||
extra_body = (
|
||||
prepare_extra_body(request) if i in structured_output_req_idx else None
|
||||
)
|
||||
async for i, request in get_request(input_requests, request_rate,
|
||||
burstiness):
|
||||
extra_body = prepare_extra_body(
|
||||
request) if i in structured_output_req_idx else None
|
||||
request_func_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=request.prompt,
|
||||
@ -533,9 +514,8 @@ async def benchmark(
|
||||
expected.append(request.completion)
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
limited_request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
)
|
||||
)
|
||||
limited_request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)))
|
||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
|
||||
if profile:
|
||||
@ -567,58 +547,54 @@ async def benchmark(
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
)
|
||||
|
||||
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
|
||||
benchmark_duration))
|
||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Request throughput (req/s):", metrics.request_throughput
|
||||
)
|
||||
)
|
||||
print("{:<40} {:<10}".format("Total generated tokens:",
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
if goodput_config_dict:
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Request goodput (req/s):", metrics.request_goodput
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Output token throughput (tok/s):", metrics.output_throughput
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
||||
)
|
||||
)
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||
metrics.request_goodput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||
metrics.total_token_throughput))
|
||||
|
||||
result = {
|
||||
"duration": benchmark_duration,
|
||||
"completed": metrics.completed,
|
||||
"total_input_tokens": metrics.total_input,
|
||||
"total_output_tokens": metrics.total_output,
|
||||
"request_throughput": metrics.request_throughput,
|
||||
"output_throughput": metrics.output_throughput,
|
||||
"total_token_throughput": metrics.total_token_throughput,
|
||||
"ttft_description": pd.Series([output.ttft for output in outputs])
|
||||
.describe()
|
||||
.to_dict(),
|
||||
"tpot_description": pd.Series([output.tpot for output in outputs])
|
||||
.describe()
|
||||
.to_dict(),
|
||||
"duration":
|
||||
benchmark_duration,
|
||||
"completed":
|
||||
metrics.completed,
|
||||
"total_input_tokens":
|
||||
metrics.total_input,
|
||||
"total_output_tokens":
|
||||
metrics.total_output,
|
||||
"request_throughput":
|
||||
metrics.request_throughput,
|
||||
"output_throughput":
|
||||
metrics.output_throughput,
|
||||
"total_token_throughput":
|
||||
metrics.total_token_throughput,
|
||||
"ttft_description":
|
||||
pd.Series([output.ttft for output in outputs]).describe().to_dict(),
|
||||
"tpot_description":
|
||||
pd.Series([output.tpot for output in outputs]).describe().to_dict(),
|
||||
"input_lens": [output.prompt_len for output in outputs],
|
||||
"output_lens": actual_output_lens,
|
||||
"output_lens":
|
||||
actual_output_lens,
|
||||
"ttfts": [output.ttft for output in outputs],
|
||||
"itls": [output.itl for output in outputs],
|
||||
"errors": [output.error for output in outputs],
|
||||
}
|
||||
|
||||
ret = [
|
||||
{"generated": output.generated_text, "expected": gt}
|
||||
for output, gt in zip(outputs, expected)
|
||||
]
|
||||
ret = [{
|
||||
'generated': output.generated_text,
|
||||
'expected': gt
|
||||
} for output, gt in zip(outputs, expected)]
|
||||
|
||||
def process_one_metric(
|
||||
# E.g., "ttft"
|
||||
@ -632,35 +608,29 @@ async def benchmark(
|
||||
# metric.
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name} (ms):",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}_ms"),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name} (ms):",
|
||||
getattr(metrics, f"median_{metric_attribute_name}_ms"),
|
||||
)
|
||||
)
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name} (ms):",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name} (ms):",
|
||||
getattr(metrics, f"median_{metric_attribute_name}_ms")))
|
||||
result[f"mean_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"mean_{metric_attribute_name}_ms"
|
||||
)
|
||||
metrics, f"mean_{metric_attribute_name}_ms")
|
||||
result[f"median_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"median_{metric_attribute_name}_ms"
|
||||
)
|
||||
metrics, f"median_{metric_attribute_name}_ms")
|
||||
result[f"std_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"std_{metric_attribute_name}_ms"
|
||||
)
|
||||
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
|
||||
metrics, f"std_{metric_attribute_name}_ms")
|
||||
for p, value in getattr(metrics,
|
||||
f"percentiles_{metric_attribute_name}_ms"):
|
||||
p_word = str(int(p)) if int(p) == p else str(p)
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
|
||||
value))
|
||||
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
|
||||
|
||||
process_one_metric("ttft", "TTFT", "Time to First Token")
|
||||
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
|
||||
process_one_metric("tpot", "TPOT",
|
||||
"Time per Output Token (excl. 1st token)")
|
||||
process_one_metric("itl", "ITL", "Inter-token Latency")
|
||||
process_one_metric("e2el", "E2EL", "End-to-end Latency")
|
||||
|
||||
@ -670,13 +640,13 @@ async def benchmark(
|
||||
|
||||
|
||||
def evaluate(ret, args):
|
||||
|
||||
def _eval_correctness_json(expected, actual):
|
||||
# extract json string from string using regex
|
||||
import regex as re
|
||||
|
||||
actual = actual.replace("\n", "").replace(" ", "").strip()
|
||||
import re
|
||||
actual = actual.replace('\n', '').replace(' ', '').strip()
|
||||
try:
|
||||
actual = re.search(r"\{.*\}", actual).group()
|
||||
actual = re.search(r'\{.*\}', actual).group()
|
||||
actual = json.loads(actual)
|
||||
except Exception:
|
||||
return False
|
||||
@ -687,33 +657,29 @@ def evaluate(ret, args):
|
||||
return actual in args.choice
|
||||
|
||||
def _eval_correctness_regex(expected, actual):
|
||||
import regex as re
|
||||
|
||||
import re
|
||||
return re.match(args.regex, actual) is not None
|
||||
|
||||
def _eval_correctness(expected, actual):
|
||||
if args.structure_type == "guided_json":
|
||||
if args.structure_type == 'guided_json':
|
||||
return _eval_correctness_json(expected, actual)
|
||||
elif args.structure_type == "guided_regex":
|
||||
elif args.structure_type == 'guided_regex':
|
||||
return _eval_correctness_regex(expected, actual)
|
||||
elif args.structure_type == "guided_choice":
|
||||
elif args.structure_type == 'guided_choice':
|
||||
return _eval_correctness_choice(expected, actual)
|
||||
else:
|
||||
return None
|
||||
|
||||
scores = []
|
||||
for res in ret:
|
||||
score = _eval_correctness(res["expected"], res["generated"])
|
||||
res["correctness"] = score
|
||||
score = _eval_correctness(res['expected'], res['generated'])
|
||||
res['correctness'] = score
|
||||
scores.append(score)
|
||||
|
||||
not_none_scores = [score for score in scores if score is not None]
|
||||
|
||||
return (
|
||||
(sum(not_none_scores) / len(not_none_scores) * 100)
|
||||
if len(not_none_scores) > 0
|
||||
else None
|
||||
)
|
||||
return (sum(not_none_scores) / len(not_none_scores) *
|
||||
100) if len(not_none_scores) > 0 else None
|
||||
|
||||
|
||||
def parse_goodput(slo_pairs):
|
||||
@ -725,10 +691,9 @@ def parse_goodput(slo_pairs):
|
||||
except ValueError as err:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Invalid format found for service level objectives. "
|
||||
'Specify service level objectives for goodput as "KEY:VALUE" '
|
||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is a "
|
||||
"number in milliseconds."
|
||||
) from err
|
||||
"number in milliseconds.") from err
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
@ -742,14 +707,12 @@ def check_goodput_args(args):
|
||||
raise ValueError(
|
||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||
"The service level objective name should be one of "
|
||||
f"{str(VALID_NAMES)}. "
|
||||
)
|
||||
f"{str(VALID_NAMES)}. ")
|
||||
if slo_val < 0:
|
||||
raise ValueError(
|
||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||
"The service level objective value should be "
|
||||
"non-negative."
|
||||
)
|
||||
"non-negative.")
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
@ -775,19 +738,19 @@ def main(args: argparse.Namespace):
|
||||
tokenizer_mode=args.tokenizer_mode,
|
||||
)
|
||||
|
||||
if args.dataset == "grammar":
|
||||
args.structure_type = "guided_grammar"
|
||||
elif args.dataset == "regex":
|
||||
args.structure_type = "guided_regex"
|
||||
elif args.dataset == "choice":
|
||||
args.structure_type = "guided_choice"
|
||||
if args.dataset == 'grammar':
|
||||
args.structure_type = 'guided_grammar'
|
||||
elif args.dataset == 'regex':
|
||||
args.structure_type = 'guided_regex'
|
||||
elif args.dataset == 'choice':
|
||||
args.structure_type = 'guided_choice'
|
||||
else:
|
||||
args.structure_type = "guided_json"
|
||||
args.structure_type = 'guided_json'
|
||||
|
||||
if args.no_structured_output:
|
||||
args.structured_output_ratio = 0
|
||||
if args.save_results:
|
||||
result_file_name = f"{args.structured_output_ratio}guided"
|
||||
result_file_name = f'{args.structured_output_ratio}guided'
|
||||
result_file_name += f"_{backend}"
|
||||
result_file_name += f"_{args.request_rate}qps"
|
||||
result_file_name += f"_{args.model.split('/')[-1]}"
|
||||
@ -815,29 +778,37 @@ def main(args: argparse.Namespace):
|
||||
disable_tqdm=args.disable_tqdm,
|
||||
profile=args.profile,
|
||||
selected_percentile_metrics=args.percentile_metrics.split(","),
|
||||
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
|
||||
selected_percentiles=[
|
||||
float(p) for p in args.metric_percentiles.split(",")
|
||||
],
|
||||
ignore_eos=args.ignore_eos,
|
||||
max_concurrency=args.max_concurrency,
|
||||
structured_output_ratio=args.structured_output_ratio,
|
||||
structured_output_backend=args.structured_output_backend,
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
)
|
||||
)
|
||||
))
|
||||
|
||||
# Save config and results to json
|
||||
score = evaluate(ret, args)
|
||||
print("correct_rate(%)", score, "\n")
|
||||
print("correct_rate(%)", score, '\n')
|
||||
if args.save_results:
|
||||
results = {
|
||||
"backend": backend,
|
||||
"model_id": model_id,
|
||||
"tokenizer_id": tokenizer_id,
|
||||
"num_prompts": args.num_prompts,
|
||||
"request_rate": args.request_rate
|
||||
if args.request_rate < float("inf")
|
||||
else "inf",
|
||||
"burstiness": args.burstiness,
|
||||
"max_concurrency": args.max_concurrency,
|
||||
"correct_rate(%)": score,
|
||||
"backend":
|
||||
backend,
|
||||
"model_id":
|
||||
model_id,
|
||||
"tokenizer_id":
|
||||
tokenizer_id,
|
||||
"num_prompts":
|
||||
args.num_prompts,
|
||||
"request_rate":
|
||||
args.request_rate if args.request_rate < float("inf") else "inf",
|
||||
"burstiness":
|
||||
args.burstiness,
|
||||
"max_concurrency":
|
||||
args.max_concurrency,
|
||||
"correct_rate(%)":
|
||||
score
|
||||
}
|
||||
results = {"outputs": ret, **results, **benchmark_result}
|
||||
|
||||
@ -846,14 +817,13 @@ def main(args: argparse.Namespace):
|
||||
result_file_name = args.result_filename
|
||||
if args.result_dir:
|
||||
result_file_name = os.path.join(args.result_dir, result_file_name)
|
||||
with open(result_file_name, "w", encoding="utf-8") as outfile:
|
||||
with open(result_file_name, "w", encoding='utf-8') as outfile:
|
||||
json.dump(results, outfile, indent=4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the online serving throughput."
|
||||
)
|
||||
description="Benchmark the online serving throughput.")
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
type=str,
|
||||
@ -875,14 +845,16 @@ if __name__ == "__main__":
|
||||
default="/v1/completions",
|
||||
help="API endpoint.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
default="json",
|
||||
choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json-schema-path", type=str, default=None, help="Path to json schema."
|
||||
)
|
||||
parser.add_argument("--dataset",
|
||||
default='json',
|
||||
choices=[
|
||||
'json', 'json-unique', 'grammar', 'regex',
|
||||
'choice', 'xgrammar_bench'
|
||||
])
|
||||
parser.add_argument("--json_schema_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to json schema.")
|
||||
parser.add_argument(
|
||||
"--max-concurrency",
|
||||
type=int,
|
||||
@ -894,8 +866,7 @@ if __name__ == "__main__":
|
||||
"initiated, this argument will control how many are actually allowed "
|
||||
"to execute at a time. This means that when used in combination, the "
|
||||
"actual request rate may be lower than specified with --request-rate, "
|
||||
"if the server is not processing requests fast enough to keep up.",
|
||||
)
|
||||
"if the server is not processing requests fast enough to keep up.")
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
@ -905,13 +876,15 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--tokenizer",
|
||||
type=str,
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
help=
|
||||
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer-mode",
|
||||
type=str,
|
||||
default="auto",
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
help=
|
||||
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-prompts",
|
||||
@ -988,51 +961,50 @@ if __name__ == "__main__":
|
||||
"--ignore-eos",
|
||||
action="store_true",
|
||||
help="Set ignore_eos flag when sending the benchmark request."
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
|
||||
)
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
|
||||
parser.add_argument(
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
default="ttft,tpot,itl",
|
||||
help="Comma-separated list of selected metrics to report percentils. "
|
||||
help="Comma-seperated list of selected metrics to report percentils. "
|
||||
"This argument specifies the metrics to report percentiles. "
|
||||
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
|
||||
'Default value is "ttft,tpot,itl".',
|
||||
)
|
||||
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
|
||||
"Default value is \"ttft,tpot,itl\".")
|
||||
parser.add_argument(
|
||||
"--metric-percentiles",
|
||||
type=str,
|
||||
default="99",
|
||||
help="Comma-separated list of percentiles for selected metrics. "
|
||||
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
|
||||
'Default value is "99". '
|
||||
'Use "--percentile-metrics" to select metrics.',
|
||||
help="Comma-seperated list of percentiles for selected metrics. "
|
||||
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--goodput",
|
||||
nargs="+",
|
||||
required=False,
|
||||
help='Specify service level objectives for goodput as "KEY:VALUE" '
|
||||
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
"pairs, where the key is a metric name, and the value is in "
|
||||
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
|
||||
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||
"separated by spaces. Allowed request level metric names are "
|
||||
'"ttft", "tpot", "e2el". For more context on the definition of '
|
||||
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
|
||||
)
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
|
||||
|
||||
parser.add_argument("--no-structured-output",
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Whether to disable JSON decoding or not.")
|
||||
parser.add_argument("--structured-output-ratio",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Ratio of Structured Outputs requests")
|
||||
parser.add_argument(
|
||||
"--no-structured-output",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Whether to disable JSON decoding or not.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--structured-output-ratio",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Ratio of Structured Outputs requests",
|
||||
)
|
||||
"--structured-output-backend",
|
||||
type=str,
|
||||
choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
|
||||
default="xgrammar",
|
||||
help="Backend to use for structured outputs")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Benchmark offline inference throughput."""
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
@ -12,25 +11,17 @@ from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
import uvloop
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from benchmark_dataset import (
|
||||
AIMODataset,
|
||||
BurstGPTDataset,
|
||||
ConversationDataset,
|
||||
InstructCoderDataset,
|
||||
RandomDataset,
|
||||
SampleRequest,
|
||||
ShareGPTDataset,
|
||||
SonnetDataset,
|
||||
VisionArenaDataset,
|
||||
)
|
||||
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
|
||||
RandomDataset, SampleRequest, ShareGPTDataset,
|
||||
SonnetDataset, VisionArenaDataset)
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizerBase)
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args,
|
||||
)
|
||||
build_async_engine_client_from_engine_args)
|
||||
from vllm.inputs import TextPrompt, TokensPrompt
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import RequestOutput
|
||||
@ -45,30 +36,23 @@ def run_vllm(
|
||||
disable_detokenize: bool = False,
|
||||
) -> tuple[float, Optional[list[RequestOutput]]]:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len
|
||||
>= (request.prompt_len + request.expected_output_len)
|
||||
for request in requests
|
||||
), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests."
|
||||
)
|
||||
llm.llm_engine.model_config.max_model_len >= (
|
||||
request.prompt_len + request.expected_output_len)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests.")
|
||||
# Add the requests to the engine.
|
||||
prompts: list[Union[TextPrompt, TokensPrompt]] = []
|
||||
sampling_params: list[SamplingParams] = []
|
||||
for request in requests:
|
||||
prompts.append(
|
||||
TokensPrompt(
|
||||
prompt_token_ids=request.prompt["prompt_token_ids"],
|
||||
multi_modal_data=request.multi_modal_data,
|
||||
)
|
||||
if "prompt_token_ids" in request.prompt
|
||||
else TextPrompt(
|
||||
prompt=request.prompt, multi_modal_data=request.multi_modal_data
|
||||
)
|
||||
)
|
||||
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
|
||||
multi_modal_data=request.multi_modal_data)
|
||||
if "prompt_token_ids" in request.prompt else \
|
||||
TextPrompt(prompt=request.prompt,
|
||||
multi_modal_data=request.multi_modal_data))
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
@ -77,8 +61,7 @@ def run_vllm(
|
||||
ignore_eos=True,
|
||||
max_tokens=request.expected_output_len,
|
||||
detokenize=not disable_detokenize,
|
||||
)
|
||||
)
|
||||
))
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
if engine_args.enable_lora:
|
||||
lora_requests = [request.lora_request for request in requests]
|
||||
@ -88,9 +71,10 @@ def run_vllm(
|
||||
outputs = None
|
||||
if not use_beam_search:
|
||||
start = time.perf_counter()
|
||||
outputs = llm.generate(
|
||||
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
|
||||
)
|
||||
outputs = llm.generate(prompts,
|
||||
sampling_params,
|
||||
lora_request=lora_requests,
|
||||
use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
else:
|
||||
assert lora_requests is None, "BeamSearch API does not support LoRA"
|
||||
@ -106,35 +90,30 @@ def run_vllm(
|
||||
beam_width=n,
|
||||
max_tokens=output_len,
|
||||
ignore_eos=True,
|
||||
),
|
||||
)
|
||||
))
|
||||
end = time.perf_counter()
|
||||
return end - start, outputs
|
||||
|
||||
|
||||
def run_vllm_chat(
|
||||
requests: list[SampleRequest],
|
||||
n: int,
|
||||
engine_args: EngineArgs,
|
||||
disable_detokenize: bool = False,
|
||||
) -> tuple[float, list[RequestOutput]]:
|
||||
requests: list[SampleRequest],
|
||||
n: int,
|
||||
engine_args: EngineArgs,
|
||||
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
|
||||
"""
|
||||
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
|
||||
multimodal models as it properly handles multimodal inputs and chat
|
||||
formatting. For non-multimodal models, use run_vllm() instead.
|
||||
"""
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
assert all(
|
||||
llm.llm_engine.model_config.max_model_len
|
||||
>= (request.prompt_len + request.expected_output_len)
|
||||
for request in requests
|
||||
), (
|
||||
"Please ensure that max_model_len is greater than the sum of "
|
||||
"prompt_len and expected_output_len for all requests."
|
||||
)
|
||||
llm.llm_engine.model_config.max_model_len >= (
|
||||
request.prompt_len + request.expected_output_len)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of "
|
||||
"prompt_len and expected_output_len for all requests.")
|
||||
|
||||
prompts = []
|
||||
sampling_params: list[SamplingParams] = []
|
||||
@ -148,8 +127,7 @@ def run_vllm_chat(
|
||||
ignore_eos=True,
|
||||
max_tokens=request.expected_output_len,
|
||||
detokenize=not disable_detokenize,
|
||||
)
|
||||
)
|
||||
))
|
||||
start = time.perf_counter()
|
||||
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
@ -166,17 +144,13 @@ async def run_vllm_async(
|
||||
from vllm import SamplingParams
|
||||
|
||||
async with build_async_engine_client_from_engine_args(
|
||||
engine_args, disable_frontend_multiprocessing
|
||||
) as llm:
|
||||
model_config = await llm.get_model_config()
|
||||
engine_args, disable_frontend_multiprocessing) as llm:
|
||||
assert all(
|
||||
model_config.max_model_len
|
||||
>= (request.prompt_len + request.expected_output_len)
|
||||
for request in requests
|
||||
), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests."
|
||||
)
|
||||
llm.model_config.max_model_len >= (request.prompt_len +
|
||||
request.expected_output_len)
|
||||
for request in requests), (
|
||||
"Please ensure that max_model_len is greater than the sum of"
|
||||
" prompt_len and expected_output_len for all requests.")
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: list[Union[TextPrompt, TokensPrompt]] = []
|
||||
@ -184,15 +158,11 @@ async def run_vllm_async(
|
||||
lora_requests: list[Optional[LoRARequest]] = []
|
||||
for request in requests:
|
||||
prompts.append(
|
||||
TokensPrompt(
|
||||
prompt_token_ids=request.prompt["prompt_token_ids"],
|
||||
multi_modal_data=request.multi_modal_data,
|
||||
)
|
||||
if "prompt_token_ids" in request.prompt
|
||||
else TextPrompt(
|
||||
prompt=request.prompt, multi_modal_data=request.multi_modal_data
|
||||
)
|
||||
)
|
||||
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
|
||||
multi_modal_data=request.multi_modal_data)
|
||||
if "prompt_token_ids" in request.prompt else \
|
||||
TextPrompt(prompt=request.prompt,
|
||||
multi_modal_data=request.multi_modal_data))
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
@ -201,16 +171,17 @@ async def run_vllm_async(
|
||||
ignore_eos=True,
|
||||
max_tokens=request.expected_output_len,
|
||||
detokenize=not disable_detokenize,
|
||||
)
|
||||
)
|
||||
))
|
||||
lora_requests.append(request.lora_request)
|
||||
|
||||
generators = []
|
||||
start = time.perf_counter()
|
||||
for i, (prompt, sp, lr) in enumerate(
|
||||
zip(prompts, sampling_params, lora_requests)
|
||||
):
|
||||
generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
|
||||
for i, (prompt, sp,
|
||||
lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
|
||||
generator = llm.generate(prompt,
|
||||
sp,
|
||||
lora_request=lr,
|
||||
request_id=f"test{i}")
|
||||
generators.append(generator)
|
||||
all_gens = merge_async_iterators(*generators)
|
||||
async for i, res in all_gens:
|
||||
@ -229,8 +200,7 @@ def run_hf(
|
||||
disable_detokenize: bool = False,
|
||||
) -> float:
|
||||
llm = AutoModelForCausalLM.from_pretrained(
|
||||
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
|
||||
)
|
||||
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
|
||||
if llm.config.model_type == "llama":
|
||||
# To enable padding in the HF backend.
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
@ -242,26 +212,22 @@ def run_hf(
|
||||
max_prompt_len = 0
|
||||
max_output_len = 0
|
||||
for i in range(len(requests)):
|
||||
prompt = requests[i].prompt
|
||||
prompt_len = requests[i].prompt_len
|
||||
output_len = requests[i].expected_output_len
|
||||
prompt, prompt_len, output_len = requests[i]
|
||||
# Add the prompt to the batch.
|
||||
batch.append(prompt)
|
||||
max_prompt_len = max(max_prompt_len, prompt_len)
|
||||
max_output_len = max(max_output_len, output_len)
|
||||
if len(batch) < max_batch_size and i != len(requests) - 1:
|
||||
# Check if we can add more requests to the batch.
|
||||
next_prompt_len = requests[i + 1].prompt_len
|
||||
next_output_len = requests[i + 1].expected_output_len
|
||||
if (
|
||||
max(max_prompt_len, next_prompt_len)
|
||||
+ max(max_output_len, next_output_len)
|
||||
) <= 2048:
|
||||
_, next_prompt_len, next_output_len = requests[i + 1]
|
||||
if (max(max_prompt_len, next_prompt_len) +
|
||||
max(max_output_len, next_output_len)) <= 2048:
|
||||
# We can add more requests to the batch.
|
||||
continue
|
||||
|
||||
# Generate the sequences.
|
||||
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
|
||||
input_ids = tokenizer(batch, return_tensors="pt",
|
||||
padding=True).input_ids
|
||||
llm_outputs = llm.generate(
|
||||
input_ids=input_ids.cuda(),
|
||||
do_sample=True,
|
||||
@ -291,7 +257,6 @@ def run_mii(
|
||||
output_len: int,
|
||||
) -> float:
|
||||
from mii import client, serve
|
||||
|
||||
llm = serve(model, tensor_parallel=tensor_parallel_size)
|
||||
prompts = [request.prompt for request in requests]
|
||||
|
||||
@ -303,9 +268,8 @@ def run_mii(
|
||||
return end - start
|
||||
|
||||
|
||||
def save_to_pytorch_benchmark_format(
|
||||
args: argparse.Namespace, results: dict[str, Any]
|
||||
) -> None:
|
||||
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
results: dict[str, Any]) -> None:
|
||||
pt_records = convert_to_pytorch_benchmark_format(
|
||||
args=args,
|
||||
metrics={
|
||||
@ -313,9 +277,9 @@ def save_to_pytorch_benchmark_format(
|
||||
"tokens_per_second": [results["tokens_per_second"]],
|
||||
},
|
||||
extra_info={
|
||||
k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
|
||||
},
|
||||
)
|
||||
k: results[k]
|
||||
for k in ["elapsed_time", "num_requests", "total_num_tokens"]
|
||||
})
|
||||
if pt_records:
|
||||
# Don't use json suffix here as we don't want CI to pick it up
|
||||
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
||||
@ -336,7 +300,6 @@ def get_requests(args, tokenizer):
|
||||
"input_len": args.input_len,
|
||||
"output_len": args.output_len,
|
||||
}
|
||||
|
||||
if args.dataset_path is None or args.dataset_name == "random":
|
||||
sample_kwargs["range_ratio"] = args.random_range_ratio
|
||||
sample_kwargs["prefix_len"] = args.prefix_len
|
||||
@ -347,31 +310,25 @@ def get_requests(args, tokenizer):
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
elif args.dataset_name == "sonnet":
|
||||
assert tokenizer.chat_template or tokenizer.default_chat_template, (
|
||||
"Tokenizer/model must have chat template for sonnet dataset."
|
||||
)
|
||||
"Tokenizer/model must have chat template for sonnet dataset.")
|
||||
dataset_cls = SonnetDataset
|
||||
sample_kwargs["prefix_len"] = args.prefix_len
|
||||
sample_kwargs["return_prompt_formatted"] = True
|
||||
elif args.dataset_name == "burstgpt":
|
||||
dataset_cls = BurstGPTDataset
|
||||
elif args.dataset_name == "hf":
|
||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = VisionArenaDataset
|
||||
common_kwargs["dataset_subset"] = None
|
||||
common_kwargs["dataset_split"] = "train"
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = InstructCoderDataset
|
||||
common_kwargs["dataset_split"] = "train"
|
||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = ConversationDataset
|
||||
common_kwargs["dataset_subset"] = args.hf_subset
|
||||
common_kwargs["dataset_split"] = args.hf_split
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = AIMODataset
|
||||
common_kwargs["dataset_subset"] = None
|
||||
common_kwargs["dataset_split"] = "train"
|
||||
if args.backend != "vllm-chat":
|
||||
raise ValueError(
|
||||
"hf datasets only are supported by vllm-chat backend")
|
||||
# Choose between VisionArenaDataset and HuggingFaceDataset based on
|
||||
# provided parameters.
|
||||
dataset_cls = (VisionArenaDataset if args.dataset_path
|
||||
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
|
||||
and args.hf_subset is None else HuggingFaceDataset)
|
||||
common_kwargs['dataset_subset'] = args.hf_subset
|
||||
common_kwargs['dataset_split'] = args.hf_split
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
|
||||
# Remove None values
|
||||
@ -386,10 +343,10 @@ def main(args: argparse.Namespace):
|
||||
random.seed(args.seed)
|
||||
# Sample the requests.
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
requests = get_requests(args, tokenizer)
|
||||
is_multi_modal = any(request.multi_modal_data is not None for request in requests)
|
||||
is_multi_modal = any(request.multi_modal_data is not None
|
||||
for request in requests)
|
||||
request_outputs: Optional[list[RequestOutput]] = None
|
||||
if args.backend == "vllm":
|
||||
if args.async_engine:
|
||||
@ -400,34 +357,23 @@ def main(args: argparse.Namespace):
|
||||
AsyncEngineArgs.from_cli_args(args),
|
||||
args.disable_frontend_multiprocessing,
|
||||
args.disable_detokenize,
|
||||
)
|
||||
)
|
||||
))
|
||||
else:
|
||||
elapsed_time, request_outputs = run_vllm(
|
||||
requests,
|
||||
args.n,
|
||||
EngineArgs.from_cli_args(args),
|
||||
args.disable_detokenize,
|
||||
)
|
||||
requests, args.n, EngineArgs.from_cli_args(args),
|
||||
args.disable_detokenize)
|
||||
elif args.backend == "hf":
|
||||
assert args.tensor_parallel_size == 1
|
||||
elapsed_time = run_hf(
|
||||
requests,
|
||||
args.model,
|
||||
tokenizer,
|
||||
args.n,
|
||||
args.hf_max_batch_size,
|
||||
args.trust_remote_code,
|
||||
args.disable_detokenize,
|
||||
)
|
||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||
args.hf_max_batch_size, args.trust_remote_code,
|
||||
args.disable_detokenize)
|
||||
elif args.backend == "mii":
|
||||
elapsed_time = run_mii(
|
||||
requests, args.model, args.tensor_parallel_size, args.output_len
|
||||
)
|
||||
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
|
||||
args.output_len)
|
||||
elif args.backend == "vllm-chat":
|
||||
elapsed_time, request_outputs = run_vllm_chat(
|
||||
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
|
||||
)
|
||||
requests, args.n, EngineArgs.from_cli_args(args),
|
||||
args.disable_detokenize)
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {args.backend}")
|
||||
|
||||
@ -439,31 +385,28 @@ def main(args: argparse.Namespace):
|
||||
for ro in request_outputs:
|
||||
if not isinstance(ro, RequestOutput):
|
||||
continue
|
||||
total_prompt_tokens += (
|
||||
len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
|
||||
)
|
||||
total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
|
||||
total_prompt_tokens += len(
|
||||
ro.prompt_token_ids) if ro.prompt_token_ids else 0
|
||||
total_output_tokens += sum(
|
||||
len(o.token_ids) for o in ro.outputs if o)
|
||||
total_num_tokens = total_prompt_tokens + total_output_tokens
|
||||
else:
|
||||
total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
|
||||
total_num_tokens = sum(r.prompt_len + r.expected_output_len
|
||||
for r in requests)
|
||||
total_output_tokens = sum(r.expected_output_len for r in requests)
|
||||
total_prompt_tokens = total_num_tokens - total_output_tokens
|
||||
|
||||
if is_multi_modal and args.backend != "vllm-chat":
|
||||
print(
|
||||
"\033[91mWARNING\033[0m: Multi-modal request with "
|
||||
f"{args.backend} backend detected. The "
|
||||
"following metrics are not accurate because image tokens are not"
|
||||
" counted. See vllm-project/vllm/issues/9778 for details."
|
||||
)
|
||||
print("\033[91mWARNING\033[0m: Multi-modal request with "
|
||||
f"{args.backend} backend detected. The "
|
||||
"following metrics are not accurate because image tokens are not"
|
||||
" counted. See vllm-project/vllm/issues/9778 for details.")
|
||||
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
|
||||
# vllm-chat backend counts the image tokens now
|
||||
|
||||
print(
|
||||
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
|
||||
f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
|
||||
)
|
||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
|
||||
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
|
||||
print(f"Total num prompt tokens: {total_prompt_tokens}")
|
||||
print(f"Total num output tokens: {total_output_tokens}")
|
||||
|
||||
@ -491,8 +434,7 @@ def validate_args(args):
|
||||
warnings.warn(
|
||||
"The '--dataset' argument will be deprecated in the next release. "
|
||||
"Please use '--dataset-name' and '--dataset-path' instead.",
|
||||
stacklevel=2,
|
||||
)
|
||||
stacklevel=2)
|
||||
args.dataset_path = args.dataset
|
||||
|
||||
if not getattr(args, "tokenizer", None):
|
||||
@ -505,8 +447,9 @@ def validate_args(args):
|
||||
|
||||
# === Dataset Configuration ===
|
||||
if not args.dataset and not args.dataset_path:
|
||||
print("When dataset path is not set, it will default to random dataset")
|
||||
args.dataset_name = "random"
|
||||
print(
|
||||
"When dataset path is not set, it will default to random dataset")
|
||||
args.dataset_name = 'random'
|
||||
if args.input_len is None:
|
||||
raise ValueError("input_len must be provided for a random dataset")
|
||||
|
||||
@ -514,55 +457,33 @@ def validate_args(args):
|
||||
# --hf-subset and --hf-split: only used
|
||||
# when dataset_name is 'hf'
|
||||
if args.dataset_name != "hf" and (
|
||||
getattr(args, "hf_subset", None) is not None
|
||||
or getattr(args, "hf_split", None) is not None
|
||||
):
|
||||
warnings.warn(
|
||||
"--hf-subset and --hf-split will be ignored \
|
||||
getattr(args, "hf_subset", None) is not None
|
||||
or getattr(args, "hf_split", None) is not None):
|
||||
warnings.warn("--hf-subset and --hf-split will be ignored \
|
||||
since --dataset-name is not 'hf'.",
|
||||
stacklevel=2,
|
||||
)
|
||||
elif args.dataset_name == "hf":
|
||||
if args.dataset_path in (
|
||||
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
|
||||
| ConversationDataset.SUPPORTED_DATASET_PATHS
|
||||
):
|
||||
assert args.backend == "vllm-chat", (
|
||||
f"{args.dataset_path} needs to use vllm-chat as the backend."
|
||||
) # noqa: E501
|
||||
elif args.dataset_path in (
|
||||
InstructCoderDataset.SUPPORTED_DATASET_PATHS
|
||||
| AIMODataset.SUPPORTED_DATASET_PATHS
|
||||
):
|
||||
assert args.backend == "vllm", (
|
||||
f"{args.dataset_path} needs to use vllm as the backend."
|
||||
) # noqa: E501
|
||||
else:
|
||||
raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
|
||||
stacklevel=2)
|
||||
elif args.dataset_name == "hf" and args.backend != "vllm-chat":
|
||||
raise ValueError(
|
||||
"When --dataset-name is 'hf', backend must be 'vllm-chat'")
|
||||
|
||||
# --random-range-ratio: only used when dataset_name is 'random'
|
||||
if args.dataset_name != "random" and args.random_range_ratio is not None:
|
||||
warnings.warn(
|
||||
"--random-range-ratio will be ignored since \
|
||||
if args.dataset_name != 'random' and args.random_range_ratio is not None:
|
||||
warnings.warn("--random-range-ratio will be ignored since \
|
||||
--dataset-name is not 'random'.",
|
||||
stacklevel=2,
|
||||
)
|
||||
stacklevel=2)
|
||||
|
||||
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
|
||||
# set.
|
||||
if (
|
||||
args.dataset_name not in {"random", "sonnet", None}
|
||||
and args.prefix_len is not None
|
||||
):
|
||||
warnings.warn(
|
||||
"--prefix-len will be ignored since --dataset-name\
|
||||
if args.dataset_name not in {"random", "sonnet", None
|
||||
} and args.prefix_len is not None:
|
||||
warnings.warn("--prefix-len will be ignored since --dataset-name\
|
||||
is not 'random', 'sonnet', or not set.",
|
||||
stacklevel=2,
|
||||
)
|
||||
stacklevel=2)
|
||||
|
||||
# === LoRA Settings ===
|
||||
if getattr(args, "enable_lora", False) and args.backend != "vllm":
|
||||
raise ValueError("LoRA benchmarking is only supported for vLLM backend")
|
||||
raise ValueError(
|
||||
"LoRA benchmarking is only supported for vLLM backend")
|
||||
if getattr(args, "enable_lora", False) and args.lora_path is None:
|
||||
raise ValueError("LoRA path must be provided when enable_lora is True")
|
||||
|
||||
@ -572,10 +493,8 @@ def validate_args(args):
|
||||
if args.backend != "hf" and args.hf_max_batch_size is not None:
|
||||
raise ValueError("HF max batch size is only for HF backend.")
|
||||
|
||||
if (
|
||||
args.backend in {"hf", "mii"}
|
||||
and getattr(args, "quantization", None) is not None
|
||||
):
|
||||
if args.backend in {"hf", "mii"} and getattr(args, "quantization",
|
||||
None) is not None:
|
||||
raise ValueError("Quantization is only for vLLM backend.")
|
||||
|
||||
if args.backend == "mii" and args.dtype != "auto":
|
||||
@ -583,32 +502,22 @@ def validate_args(args):
|
||||
if args.backend == "mii" and args.n != 1:
|
||||
raise ValueError("n must be 1 for MII backend.")
|
||||
if args.backend == "mii" and args.tokenizer != args.model:
|
||||
raise ValueError("Tokenizer must be the same as the model for MII backend.")
|
||||
|
||||
# --data-parallel is not supported currently.
|
||||
# https://github.com/vllm-project/vllm/issues/16222
|
||||
if args.data_parallel_size > 1:
|
||||
raise ValueError(
|
||||
"Data parallel is not supported in offline benchmark, \
|
||||
please use benchmark serving instead"
|
||||
)
|
||||
"Tokenizer must be the same as the model for MII backend.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
type=str,
|
||||
choices=["vllm", "hf", "mii", "vllm-chat"],
|
||||
default="vllm",
|
||||
)
|
||||
parser.add_argument("--backend",
|
||||
type=str,
|
||||
choices=["vllm", "hf", "mii", "vllm-chat"],
|
||||
default="vllm")
|
||||
parser.add_argument(
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
default="sharegpt",
|
||||
)
|
||||
default="sharegpt")
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
@ -616,104 +525,80 @@ if __name__ == "__main__":
|
||||
help="Path to the ShareGPT dataset, will be deprecated in\
|
||||
the next release. The dataset is expected to "
|
||||
"be a json in form of list[dict[..., conversations: "
|
||||
"list[dict[..., value: <prompt_or_response>]]]]",
|
||||
)
|
||||
"list[dict[..., value: <prompt_or_response>]]]]")
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset")
|
||||
parser.add_argument("--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Input prompt length for each request")
|
||||
parser.add_argument("--output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.")
|
||||
parser.add_argument("--n",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of generated sequences per prompt.")
|
||||
parser.add_argument("--num-prompts",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of prompts to process.")
|
||||
parser.add_argument("--hf-max-batch-size",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Maximum batch size for HF backend.")
|
||||
parser.add_argument(
|
||||
"--dataset-path", type=str, default=None, help="Path to the dataset"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Input prompt length for each request",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the "
|
||||
"output length from the dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--n", type=int, default=1, help="Number of generated sequences per prompt."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hf-max-batch-size",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Maximum batch size for HF backend.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json",
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to save the throughput results in JSON format.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--async-engine",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Use vLLM async engine rather than LLM class.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-frontend-multiprocessing",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Disable decoupled async engine frontend.",
|
||||
)
|
||||
help='Path to save the throughput results in JSON format.')
|
||||
parser.add_argument("--async-engine",
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Use vLLM async engine rather than LLM class.")
|
||||
parser.add_argument("--disable-frontend-multiprocessing",
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Disable decoupled async engine frontend.")
|
||||
parser.add_argument(
|
||||
"--disable-detokenize",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Do not detokenize the response (i.e. do not include "
|
||||
"detokenization time in the measurement)"
|
||||
),
|
||||
)
|
||||
help=("Do not detokenize the response (i.e. do not include "
|
||||
"detokenization time in the measurement)"))
|
||||
# LoRA
|
||||
parser.add_argument(
|
||||
"--lora-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the LoRA adapters to use. This can be an absolute path, "
|
||||
"a relative path, or a Hugging Face model identifier.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefix-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help=f"Number of prefix tokens to be used in RandomDataset "
|
||||
"and SonnetDataset. For RandomDataset, the total input "
|
||||
"length is the sum of prefix-len (default: "
|
||||
f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
|
||||
"sampled from [input_len * (1 - range_ratio), "
|
||||
"input_len * (1 + range_ratio)]. For SonnetDataset, "
|
||||
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
|
||||
"controls how much of the input is fixed lines versus "
|
||||
"random lines, but the total input length remains approximately "
|
||||
"input_len tokens.",
|
||||
)
|
||||
help="Path to the lora adapters to use. This can be an absolute path, "
|
||||
"a relative path, or a Hugging Face model identifier.")
|
||||
parser.add_argument("--prefix-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Number of prefix tokens per request."
|
||||
"This is for the RandomDataset and SonnetDataset")
|
||||
# random dataset
|
||||
parser.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=None,
|
||||
help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
|
||||
"for sampling input/output length, "
|
||||
"used only for RandomDataset. Must be in the range [0, 1) to "
|
||||
"define a symmetric sampling range "
|
||||
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for RandomDataSet.",
|
||||
)
|
||||
|
||||
# hf dtaset
|
||||
parser.add_argument(
|
||||
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hf-split", type=str, default=None, help="Split of the HF dataset."
|
||||
)
|
||||
parser.add_argument("--hf-subset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Subset of the HF dataset.")
|
||||
parser.add_argument("--hf-split",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Split of the HF dataset.")
|
||||
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user