Compare commits
6 Commits
wentao-ref ... main
| Author | SHA1 | Date |
|---|---|---|
| | 36960501d3 | |
| | b2e65cb4a7 | |
| | 2bf0bcc1fc | |
| | 697f507a8e | |
| | d5d2a0fe74 | |
| | c9791f1813 | |
@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.

 **Benchmarking Duration**: about 1hr.
@ -34,6 +34,7 @@ Runtime environment variables:
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
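
As a quick aside (editorial note, not part of the diff): a hedged way to sanity-check these test-case files locally, assuming `jq` and `python3` are available and the files follow the entry layout shown in the new `-hpu` JSON files later in this compare:

```bash
# Hypothetical local check: confirm each new HPU config parses as JSON and
# list the test cases it defines before launching a benchmark run.
for f in tests/latency-tests-hpu.json tests/throughput-tests-hpu.json tests/serving-tests-hpu.json; do
    python3 -m json.tool "$f" > /dev/null && echo "$f parses OK"
    jq -r '.[].test_name' "$f"
done
```
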
 ### Latency test
@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
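
For concreteness (editorial sketch, not part of the diff), the latency parameters above roughly correspond to an invocation like the following; the script path and flag names are assumptions and may differ across vLLM versions.

```bash
# Hypothetical latency run mirroring the settings above: 32 input tokens,
# 128 output tokens, a fixed batch size of 8, warmup plus timed iterations.
python3 benchmarks/benchmark_latency.py \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --input-len 32 \
    --output-len 128 \
    --batch-size 8 \
    --num-iters-warmup 5 \
    --num-iters 15
```
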
@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
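
As a hedged illustration (not part of the diff), a throughput run over ShareGPT data along these lines might look as follows; the script path and flag spellings are assumptions that can vary by vLLM version.

```bash
# Hypothetical throughput run: prompts come from the ShareGPT dump and vLLM
# batches them dynamically to maximize throughput.
python3 benchmarks/benchmark_throughput.py \
    --backend vllm \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 1000
```
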
@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
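
For context (editorial sketch, not part of the diff), a single QPS point of the serving test could be exercised against an already-running `vllm serve` endpoint roughly like this; the client script path and flags are assumptions that may differ across vLLM versions.

```bash
# Hypothetical serving benchmark at an average of 4 QPS; request arrival
# times follow a Poisson process, and throughput/TTFT/ITL are reported.
python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --num-prompts 200 \
    --request-rate 4
```
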
@ -15,6 +15,8 @@ check_gpus() {
     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   elif command -v amd-smi; then
     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
   fi

   if [[ $gpu_count -gt 0 ]]; then
@ -23,10 +25,16 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
+
+  declare -g arch_suffix=''
+
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
   fi
   echo "GPU type is $gpu_type"
 }
@ -138,6 +146,10 @@ kill_gpu_processes() {
     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
       sleep 1
     done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
   fi

   # remove vllm config file
@ -451,6 +463,7 @@ main() {
     ARCH='-cpu'
   else
     check_gpus
+    ARCH="$arch_suffix"
   fi
   check_hf_token
@ -0,0 +1,55 @@
(New file; from its contents, this appears to be the `tests/latency-tests-hpu.json` referenced in the benchmark docs above.)

[
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15,
            "max-model-len": 256,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "latency_llama70B_tp4",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15,
            "max-model-len": 256,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "latency_mixtral8x7B_tp2",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15,
            "max-model-len": 256,
            "async-scheduling": ""
        }
    }
]
@ -0,0 +1,82 @@
(New file; from its contents, this appears to be the `tests/serving-tests-hpu.json` referenced in the benchmark docs above.)

[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
            "max-num-seqs": 256,
            "async-scheduling": ""
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama70B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
            "max-num-seqs": 256,
            "async-scheduling": ""
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
            "max-num-seqs": 256,
            "async-scheduling": ""
        },
        "client_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    }
]
@ -0,0 +1,61 @@
(New file; from its contents, this appears to be the `tests/throughput-tests-hpu.json` referenced in the benchmark docs above.)

[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 1000,
            "backend": "vllm",
            "max-model-len": 2048,
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "throughput_llama70B_tp4",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 1000,
            "backend": "vllm",
            "max-model-len": 2048,
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "throughput_mixtral8x7B_tp2",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 1000,
            "backend": "vllm",
            "max-model-len": 2048,
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
    }
]
@ -0,0 +1,62 @@
(New file; judging by the pipeline step added below, this is likely `.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh`.)

#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@ -0,0 +1,61 @@
(New file; judging by the pipeline step added below, this is likely `.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh`.)

#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@ -1234,3 +1234,21 @@ steps:
   - .buildkite/scripts/run-prime-rl-test.sh
   commands:
   - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
0  tools/check_repo.sh (Normal file → Executable file)
0  tools/ep_kernels/configure_system_drivers.sh (Normal file → Executable file)
0  tools/ep_kernels/elastic_ep/install_eep_libraries.sh (Normal file → Executable file)
1  tools/ep_kernels/install_python_libraries.sh (Normal file → Executable file)
@ -1,3 +1,4 @@
+#!/usr/bin/env bash
 set -ex

 # prepare workspace directory
0  tools/flashinfer-build.sh (Normal file → Executable file)
0  tools/vllm-tpu/build.sh (Normal file → Executable file)
@ -26,6 +26,7 @@ import os
 import random
 import shutil
 import time
+import uuid
 import warnings
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
@ -1160,7 +1161,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     "--request-id-prefix",
     type=str,
     required=False,
-    default="benchmark-serving",
+    default=f"bench-{uuid.uuid4().hex[:8]}-",
     help="Specify the prefix of request id.",
 )
@ -316,7 +316,8 @@ class CpuPlatform(Platform):
     if (
         platform.system() == "Linux"
-        and Platform.get_cpu_architecture() == CpuArchEnum.ARM
+        and Platform.get_cpu_architecture()
+        in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
         and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
     ):
         # We need to LD_PRELOAD PyTorch's libgomp, otherwise only
@ -322,7 +322,7 @@ def initialize_ray_cluster(
     # Prevalidate GPU requirements before Ray processing
     if current_platform.is_cuda() and parallel_config.world_size > 1:
-        from vllm.utils import cuda_device_count_stateless
+        from vllm.utils.torch_utils import cuda_device_count_stateless

         available_gpus = cuda_device_count_stateless()
         if parallel_config.world_size > available_gpus: