Compare commits

..

6 Commits

19 changed files with 18 additions and 450 deletions

View File

@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
**Benchmarking Duration**: about 1hr.
@ -34,7 +34,6 @@ Runtime environment variables:
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
>
### Latency test

View File

@ -5,7 +5,7 @@
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
@ -16,7 +16,7 @@
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.
@ -28,7 +28,7 @@
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).

View File

@ -15,8 +15,6 @@ check_gpus() {
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
elif command -v hl-smi; then
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
fi
if [[ $gpu_count -gt 0 ]]; then
@ -25,16 +23,10 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g arch_suffix=''
if command -v nvidia-smi; then
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
elif command -v hl-smi; then
declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
arch_suffix='-hpu'
fi
echo "GPU type is $gpu_type"
}
@ -146,10 +138,6 @@ kill_gpu_processes() {
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
sleep 1
done
elif command -v hl-smi; then
while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
sleep 1
done
fi
# remove vllm config file
@ -463,7 +451,6 @@ main() {
ARCH='-cpu'
else
check_gpus
ARCH="$arch_suffix"
fi
check_hf_token

View File

@ -1,55 +0,0 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15,
"max-model-len": 256,
"async-scheduling": ""
}
},
{
"test_name": "latency_llama70B_tp4",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15,
"max-model-len": 256,
"async-scheduling": ""
}
},
{
"test_name": "latency_mixtral8x7B_tp2",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"num-iters-warmup": 5,
"num-iters": 15,
"max-model-len": 256,
"async-scheduling": ""
}
}
]

View File

@ -1,82 +0,0 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
"max-num-seqs": 256,
"async-scheduling": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
"max-num-seqs": 256,
"async-scheduling": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"server_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"load_format": "dummy",
"max-model-len": 2048,
"max-num-seqs": 256,
"async-scheduling": ""
},
"client_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@ -1,61 +0,0 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": ""
}
},
{
"test_name": "throughput_llama70B_tp4",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": ""
}
},
{
"test_name": "throughput_mixtral8x7B_tp2",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": ""
}
}
]

View File

@ -1,62 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-expert-parallel \
--enable-eplb \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -1,61 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
wait_for_server() {
local port=$1
timeout 600 bash -c '
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
sleep 1
done'
}
MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
kill "${SERVER_PID}" 2>/dev/null || true
for _ in {1..20}; do
kill -0 "${SERVER_PID}" 2>/dev/null || break
sleep 0.5
done
kill -9 "${SERVER_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY
cleanup
SERVER_PID=
sleep 1
PORT=$((PORT+1))
done

View File

@ -1234,21 +1234,3 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020

0
tools/check_repo.sh Executable file → Normal file
View File

0
tools/ep_kernels/configure_system_drivers.sh Executable file → Normal file
View File

0
tools/ep_kernels/elastic_ep/install_eep_libraries.sh Executable file → Normal file
View File

1
tools/ep_kernels/install_python_libraries.sh Executable file → Normal file
View File

@ -1,4 +1,3 @@
#!/usr/bin/env bash
set -ex
# prepare workspace directory

0
tools/flashinfer-build.sh Executable file → Normal file
View File

0
tools/vllm-tpu/build.sh Executable file → Normal file
View File

View File

@ -26,7 +26,6 @@ import os
import random
import shutil
import time
import uuid
import warnings
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass
@ -1161,7 +1160,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--request-id-prefix",
type=str,
required=False,
default=f"bench-{uuid.uuid4().hex[:8]}-",
default="benchmark-serving",
help="Specify the prefix of request id.",
)

View File

@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig,
QuantizeMethodBase,
)
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
FlashinferMoeBackend,
@ -94,11 +93,9 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils.deep_gemm import (
fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
should_use_deepgemm_for_fp8_linear,
)
from vllm.utils.flashinfer import has_flashinfer_moe
from vllm.utils.import_utils import has_deep_gemm
@ -546,83 +543,19 @@ class Fp8LinearMethod(LinearMethodBase):
# if batch invariant mode is enabled, prefer DeepGEMM FP8 path
# we will use BF16 dequant when DeepGEMM is not supported.
if vllm_is_batch_invariant():
# Call is_deep_gemm_supported() ahead of time for torch.compile
# dynamo has trouble tracing through
if self.block_quant and should_use_deepgemm_for_fp8_linear(
torch.bfloat16, layer.weight, self.use_deep_gemm
):
# use group quant consistent with block size across K
assert self.act_q_group_shape is not None
q_input, input_scale = QuantFP8(
False,
self.act_q_group_shape,
column_major_scales=True,
)(x)
output_2d = torch.empty(
(q_input.shape[0], layer.weight.shape[0]),
dtype=torch.bfloat16,
device=q_input.device,
)
fp8_gemm_nt(
(q_input, input_scale),
(layer.weight, layer.weight_scale),
output_2d,
)
if bias is not None:
output_2d = output_2d + bias
return output_2d
# Dequantize FP8 weights to BF16
weight_fp8 = layer.weight.to(torch.bfloat16)
weight_scale = layer.weight_scale.to(torch.bfloat16)
# Handle different quantization granularities
if self.block_quant:
# Block-wise quantization:
# - Weight is NOT transposed, shape is [N, K] (output_size, input_size)
# - Scale has shape [num_blocks_k, num_blocks_n] (TRANSPOSED!)
assert self.weight_block_size is not None
block_n, block_k = self.weight_block_size # Note: order is [N, K]
N, K = weight_fp8.shape
# determine expected number of blocks along N and K
num_blocks_n = (N + block_n - 1) // block_n
num_blocks_k = (K + block_k - 1) // block_k
# scale layout may be [num_blocks_n, num_blocks_k]
# or [num_blocks_k, num_blocks_n] depending on backend
if weight_scale.dim() != 2:
raise RuntimeError(
f"FP8 block scale must be 2D, got {tuple(weight_scale.shape)}"
)
scale_rows, scale_cols = weight_scale.shape
if (scale_rows, scale_cols) == (num_blocks_k, num_blocks_n):
if num_blocks_n == num_blocks_k:
# ambiguous square case, warn and skip transpose
logger.warning(
"Batch-invariant FP8: square block-scale %dx%d; "
"skipping transpose to avoid misorientation.",
scale_rows,
scale_cols,
)
else:
# clear KN -> transpose to NK
weight_scale = weight_scale.t()
# Expand scale to match weight dimensions
# scale_expanded should have shape [N, K]
scale_expanded = weight_scale.repeat_interleave(
block_n, dim=0
).repeat_interleave(block_k, dim=1)
# Trim to exact weight size (in case of padding)
scale_expanded = scale_expanded[:N, :K]
weight_bf16 = weight_fp8 * scale_expanded
return self.w8a8_block_fp8_linear.apply(
input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
input_scale=layer.input_scale,
bias=bias,
)
else:
# Per-tensor quantization: weight IS transposed to [K, N]
# scale should be scalar or [1] or per-output-channel [N]
# per-tensor/channel: dequant to BF16 and run GEMM
weight_fp8 = layer.weight.to(torch.bfloat16)
weight_scale = layer.weight_scale.to(torch.bfloat16)
if weight_scale.numel() == 1:
# Per-tensor: simple scalar multiplication
weight_bf16 = weight_fp8 * weight_scale
@ -641,16 +574,7 @@ class Fp8LinearMethod(LinearMethodBase):
else:
# Fallback
weight_bf16 = weight_fp8 * weight_scale
# For block quant, weight is [N, K], for per-tensor it's [K, N]
# F.linear expects weight to be [N, K], so:
if self.block_quant:
# Already in correct shape [N, K]
output = torch.nn.functional.linear(x, weight_bf16, bias)
else:
# Need to transpose back: [K, N] -> [N, K]
output = torch.nn.functional.linear(x, weight_bf16.t(), bias)
return output
return torch.nn.functional.linear(x, weight_bf16.t(), bias)
if self.use_marlin:
return apply_fp8_marlin_linear(

View File

@ -316,8 +316,7 @@ class CpuPlatform(Platform):
if (
platform.system() == "Linux"
and Platform.get_cpu_architecture()
in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
and Platform.get_cpu_architecture() == CpuArchEnum.ARM
and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
):
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only

View File

@ -322,7 +322,7 @@ def initialize_ray_cluster(
# Prevalidate GPU requirements before Ray processing
if current_platform.is_cuda() and parallel_config.world_size > 1:
from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.utils import cuda_device_count_stateless
available_gpus = cuda_device_count_stateless()
if parallel_config.world_size > available_gpus: