Compare commits

..

1 Commits

Author SHA1 Message Date
221118dc85 [Bugfix] Use a different prompt for benchmark_serving.py test prompt
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-05-17 18:36:31 +00:00
2076 changed files with 48342 additions and 135128 deletions

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import sys import sys

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import os import os

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path from pathlib import Path
import pytest import pytest

View File

@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done done
lm_eval --model vllm \ lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
--tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size "$BATCH_SIZE" --batch_size "$BATCH_SIZE"

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
LM eval harness on model to compare vs HF baseline computed offline. LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml Configs are found in configs/$MODEL.yaml
@ -18,14 +17,12 @@ RTOL = 0.08
def launch_lm_eval(eval_config, tp_size): def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False) trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
model_args = ( model_args = (
f"pretrained={eval_config['model_name']}," f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size}," f"tensor_parallel_size={tp_size},"
f"enforce_eager=true," f"enforce_eager=true,"
f"add_bos_token=true," f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code}," f"trust_remote_code={trust_remote_code}"
f"max_model_len={max_model_len}"
) )
results = lm_eval.simple_evaluate( results = lm_eval.simple_evaluate(
model="vllm", model="vllm",

View File

@ -11,7 +11,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
## Performance benchmark quick overview ## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models. **Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
**Benchmarking Duration**: about 1hr. **Benchmarking Duration**: about 1hr.
@ -31,27 +31,13 @@ Performance benchmark will be triggered when:
- A PR being merged into vllm. - A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label. - Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
Manually Trigger the benchmark
```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```
Runtime environment variables:
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
Nightly benchmark will be triggered when: Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details ## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
### Latency test ### Latency test
Here is an example of one test inside `latency-tests.json`: Here is an example of one test inside `latency-tests.json`:
@ -127,36 +113,12 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Visualizing the results ### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running. If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
Here is an example using the script to compare result_a and result_b without detail test name.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|----------------------------------------|----------------------------------------|----------|
| 0 | 142.633982 | 156.526018 | 1.097396 |
| 1 | 241.620334 | 294.018783 | 1.216863 |
| 2 | 218.298905 | 262.664916 | 1.203235 |
| 3 | 242.743860 | 299.816190 | 1.235113 |
Here is an example using the script to compare result_a and result_b with detail test name.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
## Nightly test details ## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.

View File

@ -16,7 +16,7 @@ Please download the visualization scripts in the post
- Download `nightly-benchmarks.zip`. - Download `nightly-benchmarks.zip`.
- In the same folder, run the following code: - In the same folder, run the following code:
```bash ```console
export HF_TOKEN=<your HF token> export HF_TOKEN=<your HF token>
apt update apt update
apt install -y git apt install -y git

View File

@ -4,8 +4,7 @@
- Input length: 32 tokens. - Input length: 32 tokens.
- Output length: 128 tokens. - Output length: 128 tokens.
- Batch size: fixed (8). - Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99). - Evaluation metrics: end-to-end latency (mean, median, p99).
{latency_tests_markdown_table} {latency_tests_markdown_table}
@ -15,8 +14,7 @@
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts. - Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput. - Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput. - Evaluation metrics: throughput.
{throughput_tests_markdown_table} {throughput_tests_markdown_table}
@ -27,18 +25,12 @@
- Output length: the corresponding output length of these 200 prompts. - Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests. - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2 - We also added a speculative decoding test for llama-3 70B, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
{serving_tests_markdown_table} {serving_tests_markdown_table}
## Platform Information
{platform_markdown_table}
## json version of the benchmarking tables ## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format. This section contains the data of the markdown tables above in JSON format.

View File

@ -1,66 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import pandas as pd
def compare_data_columns(
files, name_column, data_column, drop_column, ignore_test_name=False
):
print("\ncompare_data_column: " + data_column)
frames = []
compare_frames = []
for file in files:
data_df = pd.read_json(file)
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
if ignore_test_name is False:
serving_df = serving_df.rename(columns={name_column: file + "_name"})
frames.append(serving_df[file + "_name"])
serving_df = serving_df.rename(columns={data_column: file})
frames.append(serving_df[file])
compare_frames.append(serving_df[file])
if len(compare_frames) >= 2:
# Compare numbers among two files
ratio_df = compare_frames[1] / compare_frames[0]
frames.append(ratio_df)
compare_frames.pop(1)
concat_df = pd.concat(frames, axis=1)
return concat_df
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-f", "--file", action="append", type=str, help="input file name"
)
parser.add_argument(
"--ignore_test_name", action="store_true", help="ignore_test_name or not"
)
args = parser.parse_args()
files = args.file
print("comparing : " + ", ".join(files))
drop_column = "P99"
name_column = "Test name"
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
ignore_test_name = args.ignore_test_name
with open("perf_comparison.html", "w") as text_file:
for i in range(len(data_cols_to_compare)):
output_df = compare_data_columns(
files,
name_column,
data_cols_to_compare[i],
drop_column,
ignore_test_name=ignore_test_name,
)
print(output_df)
html = output_df.to_html()
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)

View File

@ -1,13 +1,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
import os import os
from importlib import util
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import psutil
from tabulate import tabulate from tabulate import tabulate
results_folder = Path("results/") results_folder = Path("results/")
@ -31,11 +28,11 @@ throughput_results = []
throughput_results_column_mapping = { throughput_results_column_mapping = {
"test_name": "Test name", "test_name": "Test name",
"gpu_type": "GPU", "gpu_type": "GPU",
"num_requests": "# of req.", # "num_requests": "# of req.",
"total_num_tokens": "Total # of tokens", # "total_num_tokens": "Total # of tokens",
"elapsed_time": "Elapsed time (s)", # "elapsed_time": "Elapsed time (s)",
"requests_per_second": "Tput (req/s)", "requests_per_second": "Tput (req/s)",
"tokens_per_second": "Tput (tok/s)", # "tokens_per_second": "Tput (tok/s)",
} }
# serving results and the keys that will be printed into markdown # serving results and the keys that will be printed into markdown
@ -43,18 +40,16 @@ serving_results = []
serving_column_mapping = { serving_column_mapping = {
"test_name": "Test name", "test_name": "Test name",
"gpu_type": "GPU", "gpu_type": "GPU",
"completed": "# of req.", # "completed": "# of req.",
"request_throughput": "Tput (req/s)", "request_throughput": "Tput (req/s)",
"total_token_throughput": "Total Token Tput (tok/s)", # "input_throughput": "Input Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)", # "output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"mean_ttft_ms": "Mean TTFT (ms)", "mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)",
"mean_tpot_ms": "Mean TPOT (ms)", # "mean_tpot_ms": "Mean TPOT (ms)",
"median_tpot_ms": "Median", # "median_tpot_ms": "Median",
"p99_tpot_ms": "P99", # "p99_tpot_ms": "P99",
"mean_itl_ms": "Mean ITL (ms)", "mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)", "median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)", "p99_itl_ms": "P99 ITL (ms)",
@ -79,20 +74,6 @@ def results_to_json(latency, throughput, serving):
) )
def get_size_with_unit(bytes, suffix="B"):
"""
Scale bytes to its proper format
e.g:
1253656 => '1.20MB'
1253656678 => '1.17GB'
"""
factor = 1024
for unit in ["", "K", "M", "G", "T", "P"]:
if bytes < factor:
return f"{bytes:.2f}{unit}{suffix}"
bytes /= factor
if __name__ == "__main__": if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
@ -173,27 +154,6 @@ if __name__ == "__main__":
serving_results = pd.DataFrame.from_dict(serving_results) serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results) throughput_results = pd.DataFrame.from_dict(throughput_results)
svmem = psutil.virtual_memory()
platform_data = {
"Physical cores": [psutil.cpu_count(logical=False)],
"Total cores": [psutil.cpu_count(logical=True)],
"Total Memory": [get_size_with_unit(svmem.total)],
}
if util.find_spec("numa") is not None:
from numa import info
platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]
if util.find_spec("cpuinfo") is not None:
from cpuinfo import get_cpu_info
platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]
platform_results = pd.DataFrame.from_dict(
platform_data, orient="index", columns=["Platform Info"]
)
raw_results_json = results_to_json( raw_results_json = results_to_json(
latency_results, throughput_results, serving_results latency_results, throughput_results, serving_results
) )
@ -239,9 +199,6 @@ if __name__ == "__main__":
throughput_md_table = tabulate( throughput_md_table = tabulate(
throughput_results, headers="keys", tablefmt="pipe", showindex=False throughput_results, headers="keys", tablefmt="pipe", showindex=False
) )
platform_md_table = tabulate(
platform_results, headers="keys", tablefmt="pipe", showindex=True
)
# document the result # document the result
with open(results_folder / "benchmark_results.md", "w") as f: with open(results_folder / "benchmark_results.md", "w") as f:
@ -253,7 +210,6 @@ if __name__ == "__main__":
latency_tests_markdown_table=latency_md_table, latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table, throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table, serving_tests_markdown_table=serving_md_table,
platform_markdown_table=platform_md_table,
benchmarking_results_in_json_string=processed_results_json, benchmarking_results_in_json_string=processed_results_json,
) )
f.write(results) f.write(results)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient from lmdeploy.serve.openai.api_client import APIClient

View File

@ -31,20 +31,6 @@ check_gpus() {
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
check_cpus() {
# check the number of CPUs and NUMA Node and GPU type.
declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo $numa_count
else
echo "Need at least 1 NUMA to run benchmarking."
exit 1
fi
declare -g gpu_type="cpu"
echo "GPU type is $gpu_type"
}
check_hf_token() { check_hf_token() {
# check if HF_TOKEN is available and valid # check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then if [[ -z "$HF_TOKEN" ]]; then
@ -83,22 +69,6 @@ json2args() {
echo "$args" echo "$args"
} }
json2envs() {
# transforms the JSON string to environment variables.
# example:
# input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
# output: VLLM_CPU_KVCACHE_SPACE=5
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map((.key ) + "=" + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() { wait_for_server() {
# wait for vllm server to start # wait for vllm server to start
# return 1 if vllm server crashes # return 1 if vllm server crashes
@ -188,24 +158,15 @@ run_latency_tests() {
# get arguments # get arguments
latency_params=$(echo "$params" | jq -r '.parameters') latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params") latency_args=$(json2args "$latency_params")
latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
latency_envs=$(json2envs "$latency_environment_variables")
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ];then if [[ $gpu_count -lt $tp ]]; then
if [[ $numa_count -lt $tp ]]; then echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." continue
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi fi
latency_command=" $latency_envs python3 benchmark_latency.py \ latency_command="python3 benchmark_latency.py \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args" $latency_args"
@ -255,24 +216,15 @@ run_throughput_tests() {
# get arguments # get arguments
throughput_params=$(echo "$params" | jq -r '.parameters') throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params") throughput_args=$(json2args "$throughput_params")
throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
throughput_envs=$(json2envs "$throughput_environment_variables")
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ];then if [[ $gpu_count -lt $tp ]]; then
if [[ $numa_count -lt $tp ]]; then echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." continue
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi fi
throughput_command=" $throughput_envs python3 benchmark_throughput.py \ throughput_command="python3 benchmark_throughput.py \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args" $throughput_args"
@ -320,27 +272,18 @@ run_serving_tests() {
# get client and server arguments # get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters') server_params=$(echo "$params" | jq -r '.server_parameters')
server_envs=$(echo "$params" | jq -r '.server_environment_variables')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params") server_args=$(json2args "$server_params")
server_envs=$(json2envs "$server_envs")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list" echo "Running over qps list $qps_list"
# check if there is enough resources to run the test # check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ];then if [[ $gpu_count -lt $tp ]]; then
if [[ $numa_count -lt $tp ]]; then echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." continue
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
fi fi
# check if server model and client model is aligned # check if server model and client model is aligned
@ -351,33 +294,23 @@ run_serving_tests() {
continue continue
fi fi
server_command="$server_envs python3 \ server_command="python3 \
-m vllm.entrypoints.openai.api_server \ -m vllm.entrypoints.openai.api_server \
$server_args" $server_args"
# run the server # run the server
echo "Running test case $test_name" echo "Running test case $test_name"
echo "Server command: $server_command" echo "Server command: $server_command"
# support remote vllm server bash -c "$server_command" &
client_remote_args="" server_pid=$!
if [[ -z "${REMOTE_HOST}" ]]; then
bash -c "$server_command" & # wait until the server is alive
server_pid=$! if wait_for_server; then
# wait until the server is alive echo ""
if wait_for_server; then echo "vllm server is up and running."
echo ""
echo "vLLM server is up and running."
else
echo ""
echo "vLLM failed to start within the timeout period."
fi
else else
server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" echo ""
if [[ ${REMOTE_PORT} ]]; then echo "vllm failed to start within the timeout period."
client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
else
client_remote_args=" --host=$REMOTE_HOST "
fi
fi fi
# iterate over different QPS # iterate over different QPS
@ -399,7 +332,7 @@ run_serving_tests() {
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
--request-rate $qps \ --request-rate $qps \
--metadata "tensor_parallel_size=$tp" \ --metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args " $client_args"
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
@ -427,14 +360,7 @@ run_serving_tests() {
} }
main() { main() {
local ARCH check_gpus
ARCH=''
if [ "$ON_CPU" == "1" ];then
check_cpus
ARCH='-cpu'
else
check_gpus
fi
check_hf_token check_hf_token
# Set to v1 to run v1 benchmark # Set to v1 to run v1 benchmark
@ -460,9 +386,9 @@ main() {
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# benchmarking # benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results # postprocess benchmarking results
pip install tabulate pandas pip install tabulate pandas

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime import datetime
import json import json

View File

@ -1,30 +0,0 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
}
]

View File

@ -1,158 +0,0 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"max_concurrency": 100,
"num_prompts": 100
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"max_concurrency": 100,
"num_prompts": 100
}
}
]

View File

@ -1,32 +0,0 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -6,6 +6,11 @@
[tool.ruff] [tool.ruff]
line-length = 88 line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]

View File

@ -1,6 +1,5 @@
steps: steps:
- label: "Build wheel - CUDA 12.8" - label: "Build wheel - CUDA 12.8"
id: build-wheel-cuda-12-8
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
@ -12,11 +11,10 @@ steps:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6" - label: "Build wheel - CUDA 12.6"
id: build-wheel-cuda-12-6
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
@ -30,11 +28,10 @@ steps:
- label: "Build wheel - CUDA 11.8" - label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel # depends_on: block-build-cu118-wheel
id: build-wheel-cuda-11-8
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
@ -47,26 +44,13 @@ steps:
- label: "Build release image" - label: "Build release image"
depends_on: block-release-image-build depends_on: block-release-image-build
id: build-release-image
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- build-release-image
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image" - label: "Build and publish TPU release image"
depends_on: ~ depends_on: ~
if: build.env("NIGHTLY") == "1" if: build.env("NIGHTLY") == "1"
@ -80,16 +64,15 @@ steps:
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins: plugins:
- docker-login#v3.0.0: - docker-login#v3.0.0:
username: vllmbot username: vllm
password-env: DOCKERHUB_TOKEN password-env: DOCKERHUB_TOKEN
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- input: "Provide Release version here" - input: "Provide Release version here"
id: input-release-version
fields: fields:
- text: "What is the release version?" - text: "What is the release version?"
key: release-version key: "release-version"
- block: "Build CPU release image" - block: "Build CPU release image"
key: block-cpu-release-image-build key: block-cpu-release-image-build
@ -101,8 +84,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
@ -118,7 +100,6 @@ steps:
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"

View File

@ -1,31 +0,0 @@
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@ -1,17 +0,0 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 ci.log"
exit 1
fi
INPUT_FILE="$1"
# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

View File

@ -94,10 +94,6 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi fi
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests #ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \ commands="${commands} \
@ -107,9 +103,10 @@ fi
if [[ $commands == *" kernels/attention"* ]]; then if [[ $commands == *" kernels/attention"* ]]; then
commands="${commands} \ commands="${commands} \
--ignore=kernels/attention/test_attention_selector.py \ --ignore=kernels/attention/stest_attention_selector.py \
--ignore=kernels/attention/test_blocksparse_attention.py \ --ignore=kernels/attention/test_blocksparse_attention.py \
--ignore=kernels/attention/test_encoder_decoder_attn.py \ --ignore=kernels/attention/test_encoder_decoder_attn.py \
--ignore=kernels/attention/test_attention_selector.py \
--ignore=kernels/attention/test_flash_attn.py \ --ignore=kernels/attention/test_flash_attn.py \
--ignore=kernels/attention/test_flashinfer.py \ --ignore=kernels/attention/test_flashinfer.py \
--ignore=kernels/attention/test_prefix_prefill.py \ --ignore=kernels/attention/test_prefix_prefill.py \

View File

@ -7,7 +7,6 @@ set -ex
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
if [[ -n "$container_id" ]]; then if [[ -n "$container_id" ]]; then
podman stop --all -t0
podman rm -f "$container_id" || true podman rm -f "$container_id" || true
fi fi
podman system prune -f podman system prune -f
@ -38,7 +37,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.

View File

@ -6,83 +6,75 @@ set -ex
# allow to bind to different cores # allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95} CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1} NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
set -e; set -e;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() { function cpu_tests() {
set -e set -e
export NUMA_NODE=$2 export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference # offline inference
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test # Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
# Note: disable until supports V1 pytest -v -s tests/kernels/test_cache.py -m cpu_model
# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
# Note: disable Bart until supports V1 pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model \ pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
--ignore=tests/models/language/generation/test_bart.py pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test # Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Note: disable it until supports V1
# Run AWQ test # Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
# set -e set -e
# VLLM_USE_V1=0 pytest -s -v \ pytest -s -v \
# tests/quantization/test_ipex_quant.py" tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving # online serving
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ python3 benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model facebook/opt-125m \ --model facebook/opt-125m \
@ -91,7 +83,7 @@ function cpu_tests() {
--tokenizer facebook/opt-125m" --tokenizer facebook/opt-125m"
# Run multi-lora tests # Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/lora/test_qwen2vl.py" tests/lora/test_qwen2vl.py"
@ -99,4 +91,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests export -f cpu_tests
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

View File

@ -2,55 +2,23 @@
# This script build the CPU docker image and run the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -exuo pipefail set -ex
# Try building the docker image # Try building the docker image
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - . docker build -t hpu-test-env -f docker/Dockerfile.hpu .
FROM gaudi-base-image:latest
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=empty pip install .
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
EOF
# Setup cleanup # Setup cleanup
# certain versions of HPU software stack have a bug that can # certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use # override the exit code of the script, so we need to use
# separate remove_docker_containers and remove_docker_containers_and_exit # separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container # functions, while other platforms only need one remove_docker_container
# function. # function.
EXITCODE=1 EXITCODE=1
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; } remove_docker_container() { docker rm -f hpu-test || true; }
trap 'remove_docker_containers; exit $EXITCODE;' EXIT remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
remove_docker_containers trap remove_docker_container_and_exit EXIT
remove_docker_container
echo "Running HPU plugin v1 test"
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
-e HABANA_VISIBLE_DEVICES=all \
hpu-plugin-v1-test-env \
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
EXITCODE=$? EXITCODE=$?
if [ $EXITCODE -eq 0 ]; then
echo "Test with basic model passed"
else
echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
fi
# The trap will handle the container removal and final exit.

View File

@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}" mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}" mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image # Try building the docker image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
# prune old image and containers to save disk space, and only once a day # prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp. # by using a timestamp file in tmp.
@ -48,17 +47,8 @@ trap remove_docker_container EXIT
docker run --rm -it --device=/dev/neuron0 --network bridge \ docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "HF_TOKEN=${HF_TOKEN}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \ --name "${container_name}" \
${image_name} \ ${image_name} \
/bin/bash -c " /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo \"Running test file: \$f\";
python3 -m pytest \$f -v --capture=tee-sys;
done
"

View File

@ -2,186 +2,102 @@
set -xu set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build the docker image. # Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu . docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup. # Set up cleanup.
cleanup_docker() { remove_docker_container() { docker rm -f tpu-test || true; }
# Get Docker's root directory trap remove_docker_container EXIT
docker_root=$(docker info -f '{{.DockerRootDir}}') # Remove the container that might not be cleaned up in the previous run.
if [ -z "$docker_root" ]; then remove_docker_container
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN. # For HF_TOKEN.
source /etc/environment source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \ docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c ' vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
set -e # Exit immediately if a command exits with a non-zero status. && python3 -m pip install pytest pytest-asyncio tpu-info \
set -u # Treat unset variables as an error. && python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
&& tpu-info \
&& { \
echo TEST_0: Running test_perf.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
echo TEST_0_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_1: Running test_compilation.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
echo TEST_1_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_2: Running test_basic.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
echo TEST_2_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
echo TEST_3_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_4: Running test_quantization_accuracy.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
echo TEST_4_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_5: Running examples/offline_inference/tpu.py; \
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
echo TEST_5_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_6: Running test_tpu_model_runner.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
echo TEST_6_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_7: Running test_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_8: Running test_topk_topp_sampler.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
echo TEST_8_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_9: Running test_multimodal.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
echo TEST_9_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_10: Running test_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
echo TEST_10_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_11: Running test_struct_output_generate.py; \
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
echo TEST_11_EXIT_CODE: \$?; \
} & \
{ \
echo TEST_12: Running test_moe_pallas.py; \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
echo TEST_12_EXIT_CODE: \$?; \
} & \
# Disable the TPU LoRA tests until the feature is activated
# & { \
# echo TEST_13: Running test_moe_pallas.py; \
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
# echo TEST_13_EXIT_CODE: \$?; \
# } & \
wait \
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
"
echo "--- Starting script inside Docker container ---"
# Create results directory
RESULTS_DIR=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $RESULTS_DIR"
# Install dependencies
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
echo "--- Starting Tests ---"
set +e
overall_script_exit_code=0
# --- Test Definitions ---
# If a test fails, this function will print logs and will not cause the main script to exit.
run_test() {
local test_num=$1
local test_name=$2
local test_command=$3
local log_file="$RESULTS_DIR/test_${test_num}.log"
local actual_exit_code
echo "--- TEST_$test_num: Running $test_name ---"
# Execute the test command.
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
actual_exit_code=$?
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
if [ "$actual_exit_code" -ne 0 ]; then
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
if [ -f "$log_file" ]; then
cat "$log_file" >&2
else
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
fi
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
return "$actual_exit_code" # Return the failure code
else
echo "TEST_$test_num ($test_name) PASSED."
return 0 # Return success
fi
}
# Helper function to call run_test and update the overall script exit code
run_and_track_test() {
local test_num_arg="$1"
local test_name_arg="$2"
local test_command_arg="$3"
# Run the test
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
local test_specific_exit_code=$?
# If the test failed, set the overall script exit code to 1
if [ "$test_specific_exit_code" -ne 0 ]; then
# No need for extra echo here, run_test already logged the failure.
overall_script_exit_code=1
fi
}
# --- Actual Test Execution ---
run_and_track_test 0 "test_perf.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling # TODO: This test fails because it uses RANDOM_SEED sampling
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
docker build -t ${image_name} -f docker/Dockerfile.xpu . docker build -t ${image_name} -f docker/Dockerfile.xpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
docker rm -f "${container_name}" || true; docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true; docker image rm -f "${image_name}" || true;
docker system prune -f || true; docker system prune -f || true;
} }
@ -26,9 +26,6 @@ docker run \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \
sh -c ' sh -c '
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
cd tests
pytest -v -s v1/core
' '

View File

@ -1,18 +0,0 @@
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 path/to/test.py::test_name"
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
exit 1
fi
TEST=$1
COUNT=1
while pytest -sv "$TEST"; do
COUNT=$((COUNT + 1))
echo "RUN NUMBER ${COUNT}"
done

View File

@ -1,24 +0,0 @@
#!/bin/bash
set -euo pipefail
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi

View File

@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -1,92 +0,0 @@
#!/bin/bash
if [ ! -f "$1" ]; then
echo "Error: The env file '$1' does not exist."
exit 1 # Exit the script with a non-zero status to indicate an error
fi
ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"
if [ -z "$HF_TOKEN" ]; then
echo "Error: HF_TOKEN is not set or is empty."
exit 1
fi
# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
exit 1
fi
echo "Run model $MODEL"
echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \
-v /dev/shm:/dev/shm \
vllm/vllm-tpu-bm tail -f /dev/null
echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
if [ "$BUILDKITE" = "true" ]; then
echo "Running inside Buildkite"
buildkite-agent artifact upload "$VLLM_LOG"
buildkite-agent artifact upload "$BM_LOG"
else
echo "Not running inside Buildkite"
fi
#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
echo "Failed to get the throughput"
exit 1
fi
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
exit 1
fi

View File

@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
MAX_NUM_SEQS=128
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -1,94 +0,0 @@
#!/bin/bash
set -euo pipefail
VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"
if [ -n "$TARGET_COMMIT" ]; then
head_hash=$(git rev-parse HEAD)
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
exit 1
fi
fi
echo "model: $MODEL"
echo
#
# create a log folder
#
mkdir "$WORKSPACE/log"
# TODO: Move to image building.
pip install pandas
pip install datasets
#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
exit 1
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
echo "Application started"
break
else
echo "wait for 10 seconds..."
sleep 10
fi
done
#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."
echo
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

View File

@ -33,23 +33,14 @@ steps:
- label: Documentation Build # 2min - label: Documentation Build # 2min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/test_docs" working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true fast_check: true
no_gpu: True no_gpu: True
commands: commands:
- pip install -r ../requirements/docs.txt - pip install -r ../../requirements/docs.txt
# TODO: add `--strict` once warnings in docstrings are fixed - SPHINXOPTS=\"-W\" make html
- mkdocs build # Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/generate_nightly_torch_test.py
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh
- label: Async Engine, Inputs, Utils, Worker Test # 24min - label: Async Engine, Inputs, Utils, Worker Test # 24min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -68,7 +59,6 @@ steps:
- pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal - pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils - pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker - pytest -v -s worker # Worker
@ -99,7 +89,7 @@ steps:
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test - label: Chunked Prefill Test
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness/test_chunked_prefill - tests/basic_correctness/test_chunked_prefill
@ -117,7 +107,7 @@ steps:
commands: commands:
- pytest -v -s core - pytest -v -s core
- label: Entrypoints Test (LLM) # 40min - label: Entrypoints Test # 40min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
@ -125,6 +115,8 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/entrypoints/llm - tests/entrypoints/llm
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
- tests/entrypoints/offline_mode - tests/entrypoints/offline_mode
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
@ -133,21 +125,9 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
- label: Entrypoints Test (API Server) # 40min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min - label: Distributed Tests (4 GPUs) # 10min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -158,15 +138,12 @@ steps:
- vllm/core/ - vllm/core/
- tests/distributed/test_utils - tests/distributed/test_utils
- tests/distributed/test_pynccl - tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/spec_decode/e2e/test_integration_dist_tp4 - tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile/test_basic_correctness - tests/compile/test_basic_correctness
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py - examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py - tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py - tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/engine/test_engine_core_client.py
commands: commands:
# test with tp=2 and external_dp=2 # test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@ -174,14 +151,11 @@ steps:
# test with tp=2 and pp=2 # test with tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with internal dp # test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager - python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
@ -190,23 +164,6 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd - popd
- label: EPLB Algorithm Test
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_algo.py
commands:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/eplb
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
- label: Metrics, Tracing Test # 10min - label: Metrics, Tracing Test # 10min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
num_gpus: 2 num_gpus: 2
@ -216,18 +173,13 @@ steps:
- tests/tracing - tests/tracing
commands: commands:
- pytest -v -s metrics - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s tracing - pytest -v -s tracing
##### fast check tests ##### ##### fast check tests #####
##### 1 GPU test ##### ##### 1 GPU test #####
- label: Regression Test # 5min - label: Regression Test # 5min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_regression - tests/test_regression
@ -237,7 +189,7 @@ steps:
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min - label: Engine Test # 10min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/engine - tests/engine
@ -245,9 +197,8 @@ steps:
- tests/test_sequence - tests/test_sequence
- tests/test_config - tests/test_config
- tests/test_logger - tests/test_logger
- tests/test_vllm_port
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately # OOM in the CI unless we run this separately
- pytest -v -s tokenization - pytest -v -s tokenization
@ -269,7 +220,6 @@ steps:
- pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py - pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
@ -292,9 +242,9 @@ steps:
- python3 offline_inference/llm_engine_example.py - python3 offline_inference/llm_engine_example.py
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_embedding.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
@ -310,15 +260,6 @@ steps:
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/cuda
commands:
- pytest -v -s cuda/test_cuda_context.py
- label: Samplers Test # 36min - label: Samplers Test # 36min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
@ -330,6 +271,17 @@ steps:
- pytest -v -s samplers - pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LogitsProcessor Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/model_executor/layers
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
- tests/model_executor/test_guided_processors
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py
- label: Speculative decoding tests # 40min - label: Speculative decoding tests # 40min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
@ -342,7 +294,7 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each - label: LoRA Test %N # 15min each
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
@ -350,7 +302,7 @@ steps:
parallelism: 4 parallelism: 4
- label: PyTorch Compilation Unit Tests - label: PyTorch Compilation Unit Tests
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -358,10 +310,8 @@ steps:
commands: commands:
- pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- label: PyTorch Fullgraph Smoke Test # 9min - label: PyTorch Fullgraph Smoke Test # 9min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
@ -374,7 +324,6 @@ steps:
# these tests need to be separated, cannot combine # these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/piecewise/test_full_cudagraph.py
- label: PyTorch Fullgraph Test # 18min - label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
@ -432,28 +381,15 @@ steps:
- pytest -v -s kernels/mamba - pytest -v -s kernels/mamba
- label: Tensorizer Test # 11min - label: Tensorizer Test # 11min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
soft_fail: true soft_fail: true
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/model_loader - vllm/model_executor/model_loader
- tests/tensorizer_loader - tests/tensorizer_loader
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader - pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test
mirror_hardwares: [amdexperimental, amdproduction]
soft_fail: true
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
- label: Benchmarks # 9min - label: Benchmarks # 9min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
@ -478,9 +414,6 @@ steps:
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
- tests/quantization - tests/quantization
commands: commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
@ -524,7 +457,7 @@ steps:
##### models test ##### ##### models test #####
- label: Basic Models Test # 24min - label: Basic Models Test # 24min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
torch_nightly: true torch_nightly: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -534,7 +467,10 @@ steps:
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py - pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py - pytest -v -s models/test_vision.py
- pytest -v -s models/test_initialization.py # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- label: Language Models Test (Standard) - label: Language Models Test (Standard)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -548,36 +484,16 @@ steps:
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model - pytest -v -s models/language -m core_model
- label: Language Models Test (Hybrid) # 35 min - label: Language Models Test (Extended)
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m hybrid_model
- label: Language Models Test (Extended Generation) # 1hr20min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language/generation - tests/models/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - pytest -v -s models/language -m 'not core_model'
- label: Language Models Test (Extended Pooling) # 36min
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
- label: Multi-Modal Models Test (Standard) - label: Multi-Modal Models Test (Standard)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -613,7 +529,7 @@ steps:
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3 - label: Multi-Modal Models Test (Extended) 3
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental, amdproduction]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -640,18 +556,6 @@ steps:
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
@ -677,18 +581,13 @@ steps:
- vllm/executor/ - vllm/executor/
- vllm/model_executor/models/ - vllm/model_executor/models/
- tests/distributed/ - tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands: commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 40min - label: Distributed Tests (2 GPUs) # 40min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
@ -706,13 +605,9 @@ steps:
- vllm/worker/model_runner.py - vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py - entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py - tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/ - vllm/v1/engine/
commands: commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
@ -753,7 +648,7 @@ steps:
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Multi-step Tests (4 GPUs) # 36min - label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
@ -814,7 +709,7 @@ steps:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional - label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
gpu: a100 gpu: a100

View File

@ -1,6 +0,0 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment

27
.github/CODEOWNERS vendored
View File

@ -10,22 +10,14 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96 /vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee CMakeLists.txt @tlrmchlsmth
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
# vLLM V1 # vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm /vllm/v1/structured_output @mgoin @russellb
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo /.buildkite/lm-eval-harness @mgoin @simon-mo
@ -34,8 +26,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao /tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/kernels @tlrmchlsmth @WoosukKwon /tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96 /tests/models @DarkLight1337 @ywang96
@ -45,11 +37,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/quantization @mgoin @robertgshaw2-redhat /tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96 /tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao /tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee
# Docs
/docs @hmellor
mkdocs.yaml @hmellor

View File

@ -8,16 +8,6 @@ body:
attributes: attributes:
value: > value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
attributes:
value: |
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
- Passwords or authentication credentials
- Private URLs or endpoints
- Personal or confidential data
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea - type: textarea
attributes: attributes:
label: Your current environment label: Your current environment
@ -91,14 +81,14 @@ body:
required: true required: true
- type: markdown - type: markdown
attributes: attributes:
value: | value: >
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output: ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for reporting 🙏! Thanks for contributing 🎉!
- type: checkboxes - type: checkboxes
id: askllm id: askllm
attributes: attributes:

View File

@ -1,69 +0,0 @@
name: 🧪 CI failure report
description: Report a failing test.
title: "[CI Failure]: "
labels: ["ci-failure"]
body:
- type: markdown
attributes:
value: >
#### Include the name of the failing Buildkite step and test file in the title.
- type: input
attributes:
label: Name of failing test
description: |
Paste in the fully-qualified name of the failing test from the logs.
placeholder: |
`path/to/test_file.py::test_name[params]`
validations:
required: true
- type: checkboxes
attributes:
label: Basic information
description: Select all items that apply to the failing test.
options:
- label: Flaky test
- label: Can reproduce locally
- label: Caused by external libraries (e.g. bug in `transformers`)
- type: textarea
attributes:
label: 🧪 Describe the failing test
description: |
Please provide a clear and concise description of the failing test.
placeholder: |
A clear and concise description of the failing test.
```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
```
validations:
required: true
- type: textarea
attributes:
label: 📝 History of failing test
description: |
Since when did the test start to fail?
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
placeholder: |
Approximate timeline and/or problematic PRs
A link to the Buildkite analytics of the failing test (if available)
validations:
required: true
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
- type: markdown
attributes:
value: >
Thanks for reporting 🙏!

View File

@ -1,18 +1,6 @@
## Essential Elements of an Effective PR Description Checklist FILL IN THE PR DESCRIPTION HERE
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. FIX #xxxx (*link existing issues this PR will resolve*)
## Purpose
## Test Plan
## Test Result
## (Optional) Documentation Update
<!--- pyml disable-next-line no-emphasis-as-heading --> <!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions) **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)

103
.github/mergify.yml vendored
View File

@ -27,22 +27,6 @@ pull_request_rules:
add: add:
- ci/build - ci/build
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py
- files~=^vllm/model_executor/models/.*deepseek.*\.py
- files~=^vllm/reasoning/.*deepseek.*\.py
- files~=^vllm/transformers_utils/.*deepseek.*\.py
- title~=(?i)DeepSeek
actions:
label:
add:
- deepseek
- name: label-frontend - name: label-frontend
description: Automatically apply frontend label description: Automatically apply frontend label
conditions: conditions:
@ -52,21 +36,6 @@ pull_request_rules:
add: add:
- frontend - frontend
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
- title~=(?i)llama
actions:
label:
add:
- llama
- name: label-multi-modality - name: label-multi-modality
description: Automatically apply multi-modality label description: Automatically apply multi-modality label
conditions: conditions:
@ -74,70 +43,14 @@ pull_request_rules:
- files~=^vllm/multimodal/ - files~=^vllm/multimodal/
- files~=^tests/multimodal/ - files~=^tests/multimodal/
- files~=^tests/models/multimodal/ - files~=^tests/models/multimodal/
- files~=^tests/models/*/audio_language/
- files~=^tests/models/*/vision_language/
- files=tests/models/test_vision.py - files=tests/models/test_vision.py
actions: actions:
label: label:
add: add:
- multi-modality - multi-modality
- name: label-new-model
description: Automatically apply new-model label
conditions:
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
actions:
label:
add:
- new-model
- name: label-performance
description: Automatically apply performance label
conditions:
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
- files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/
actions:
label:
add:
- performance
- name: label-qwen
description: Automatically apply qwen label
conditions:
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
actions:
label:
add:
- qwen
- name: label-rocm
description: Automatically apply rocm label
conditions:
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- files~=^vllm/attention/backends/rocm.*\.py
- files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
- title~=(?i)ROCm
actions:
label:
add:
- rocm
- name: label-structured-output - name: label-structured-output
description: Automatically apply structured-output label description: Automatically apply structured-output label
conditions: conditions:
@ -145,7 +58,7 @@ pull_request_rules:
- files~=^benchmarks/structured_schemas/ - files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh - files=benchmarks/run_structured_output_benchmark.sh
- files=docs/features/structured_outputs.md - files=docs/source/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py - files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@ -165,14 +78,8 @@ pull_request_rules:
conditions: conditions:
- or: - or:
- files~=^vllm/spec_decode/ - files~=^vllm/spec_decode/
- files~=^vllm/v1/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py - files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/ - files~=^tests/spec_decode/
- files~=^tests/v1/spec_decode/
- files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
- files~=^vllm/model_executor/models/.*eagle.*\.py
- files=vllm/model_executor/models/mlp_speculator.py
- files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
actions: actions:
label: label:
add: add:
@ -228,7 +135,9 @@ pull_request_rules:
- files~=^tests/entrypoints/openai/tool_parsers/ - files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/ - files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/features/tool_calling.md - files=docs/source/features/tool_calling.md
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
- files=docs/source/getting_started/examples/chat_with_tools.md
- files~=^examples/tool_chat_* - files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py - files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py

View File

@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)" # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF python3 - <<EOF
import regex as re import re
with open("${NEW}", "r") as file: with open("${NEW}", "r") as file:
content = file.read() content = file.read()

View File

@ -20,12 +20,7 @@ jobs:
with: with:
python-version: '3.12' python-version: '3.12'
- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install regex
- name: Update PR description - name: Update PR description
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"

View File

@ -68,7 +68,7 @@ jobs:
export AWS_ACCESS_KEY_ID=minioadmin export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin
sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
- name: curl test - name: curl test
run: | run: |

9
.gitignore vendored
View File

@ -77,6 +77,11 @@ instance/
# Scrapy stuff: # Scrapy stuff:
.scrapy .scrapy
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/
docs/source/api/vllm
# PyBuilder # PyBuilder
.pybuilder/ .pybuilder/
target/ target/
@ -146,8 +151,6 @@ venv.bak/
# mkdocs documentation # mkdocs documentation
/site /site
docs/argparse
docs/examples
# mypy # mypy
.mypy_cache/ .mypy_cache/
@ -201,5 +204,5 @@ benchmarks/**/*.json
actionlint actionlint
shellcheck*/ shellcheck*/
# Ignore moe/marlin_moe gen code # Ingore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_* csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -11,19 +11,19 @@ repos:
hooks: hooks:
- id: yapf - id: yapf
args: [--in-place, --verbose] args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7 rev: v0.11.7
hooks: hooks:
- id: ruff - id: ruff
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.* files: ^(.buildkite|benchmarks)/.*
- repo: https://github.com/crate-ci/typos - repo: https://github.com/codespell-project/codespell
rev: v1.34.0 rev: v2.4.1
hooks: hooks:
- id: typos - id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
rev: 6.0.1 rev: 6.0.1
hooks: hooks:
@ -39,7 +39,6 @@ repos:
rev: v0.9.29 rev: v0.9.29
hooks: hooks:
- id: pymarkdown - id: pymarkdown
exclude: '.*\.inc\.md'
args: [fix] args: [fix]
- repo: https://github.com/rhysd/actionlint - repo: https://github.com/rhysd/actionlint
rev: v1.7.7 rev: v1.7.7
@ -53,17 +52,12 @@ repos:
files: ^requirements/test\.(in|txt)$ files: ^requirements/test\.(in|txt)$
- repo: local - repo: local
hooks: hooks:
- id: format-torch-nightly-test
name: reformat nightly_torch_test.txt to be in sync with test.in
language: python
entry: python tools/generate_nightly_torch_test.py
files: ^requirements/test\.(in|txt)$
- id: mypy-local - id: mypy-local
name: Run mypy for local Python installation name: Run mypy for local Python installation
entry: tools/mypy.sh 0 "local" entry: tools/mypy.sh 0 "local"
language: python language: python
types: [python] types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
stages: [pre-commit] # Don't run in CI stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9 name: Run mypy for Python 3.9
@ -120,11 +114,6 @@ repos:
entry: python tools/check_spdx_header.py entry: python tools/check_spdx_header.py
language: python language: python
types: [python] types: [python]
- id: check-root-lazy-imports
name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py
language: python
types: [python]
- id: check-filenames - id: check-filenames
name: Check for spaces in all filenames name: Check for spaces in all filenames
entry: bash entry: bash
@ -138,39 +127,10 @@ repos:
name: Update Dockerfile dependency graph name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh entry: tools/update-dockerfile-graph.sh
language: script language: script
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/check_triton_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/check_pickle_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [pathspec, regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
types: [python]
pass_filenames: true
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion
entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."' entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system language: system
verbose: true verbose: true
pass_filenames: false pass_filenames: false

View File

@ -8,8 +8,12 @@ build:
tools: tools:
python: "3.12" python: "3.12"
mkdocs: sphinx:
configuration: mkdocs.yaml configuration: docs/source/conf.py
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:

View File

@ -23,15 +23,15 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables # Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}") set(ignoreMe "${VLLM_PYTHON_PATH}")
# Prevent installation of dependencies (cutlass) by default.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# #
# Supported python versions. These versions will be searched in order, the # Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py. # first match will be selected. These should be kept in sync with setup.py.
# #
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@ -45,7 +45,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch # requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm # versions are derived from docker/Dockerfile.rocm
# #
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1") set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
# #
@ -79,15 +79,6 @@ endif()
# #
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
endif()
# #
# Forward the non-CUDA device extensions to external CMake scripts. # Forward the non-CUDA device extensions to external CMake scripts.
# #
@ -171,6 +162,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif() endif()
# #
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@ -181,6 +173,9 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Set rocm version dev int.
#
if(VLLM_GPU_LANG STREQUAL "HIP") if(VLLM_GPU_LANG STREQUAL "HIP")
# #
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -188,6 +183,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3") set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
# #
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed. # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -230,7 +226,6 @@ endif()
# #
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/cache_kernels.cu" "csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu" "csrc/attention/paged_attention_v2.cu"
@ -240,7 +235,6 @@ set(VLLM_EXT_SRC
"csrc/activation_kernels.cu" "csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu" "csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu" "csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu" "csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@ -257,7 +251,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -287,6 +281,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
@ -306,7 +302,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that # Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet. # are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX # 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS) if (MARLIN_ARCHS)
# #
@ -391,7 +387,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later # CUDA 12.0 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
@ -407,7 +403,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
else() else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or " "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on " "later if you intend on running FP8 quantized models on "
@ -418,40 +414,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif() endif()
endif() endif()
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later # CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1")
# Let scaled_mm_c2x know it doesn't need to build these arches
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
"later if you intend on running FP8 quantized models on "
"Blackwell.")
else()
message(STATUS "Not building scaled_mm_c3x_120 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
@ -466,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
else() else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or " "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
"later if you intend on running FP8 quantized models on " "later if you intend on running FP8 quantized models on "
@ -482,7 +448,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# kernels for the remaining archs that are not already built for 3x. # kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8) # (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}") "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x # subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS) if (SCALED_MM_2X_ARCHS)
@ -509,7 +475,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper). # require CUDA 12.2 or later (and only work on Hopper).
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
@ -518,7 +484,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
else() else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS) if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later " "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper.") "if you intend on running FP8 sparse quantized models on Hopper.")
@ -530,7 +496,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# FP4 Archs and flags # FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu" "csrc/quantization/fp4/nvfp4_experts_quant.cu"
@ -541,7 +507,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${FP4_ARCHS}") CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else() else()
message(STATUS "Not building NVFP4 as no compatible archs were found.") message(STATUS "Not building NVFP4 as no compatible archs were found.")
@ -551,10 +516,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MLA Archs and flags # CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
set(SRCS set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu" "csrc/attention/mla/cutlass_mla_kernels.cu")
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}") CUDA_ARCHS "${MLA_ARCHS}")
@ -572,12 +536,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MoE kernels # CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# if it's possible to compile MoE kernels that use its output. # to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu") set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs( set_gencode_flags_for_srcs(
SRCS "${SRCS}" SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}") CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -591,46 +556,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"if you intend on running FP8 quantized MoE models on Hopper.") "if you intend on running FP8 quantized MoE models on Hopper.")
else() else()
message(STATUS "Not building grouped_mm_c3x as no compatible archs found " message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures.")
endif()
endif()
# moe_data.cu is used by all CUTLASS MoE kernels.
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
message(STATUS "Not building moe_data as CUDA Compiler version is "
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
"if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
else()
message(STATUS "Not building moe_data as no compatible archs found "
"in CUDA target architectures.")
endif()
endif()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
"if you intend on running FP8 quantized MoE models on Blackwell.")
else()
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
"in CUDA target architectures") "in CUDA target architectures")
endif() endif()
endif() endif()
@ -641,7 +566,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The machete kernels only work on hopper and require CUDA 12.0 or later. # The machete kernels only work on hopper and require CUDA 12.0 or later.
# Only build Machete kernels if we are building for something compatible with sm90a # Only build Machete kernels if we are building for something compatible with sm90a
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS) if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
# #
# For the Machete kernels we automatically generate sources for various # For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules. # preselected input type pairs and schedules.
@ -693,7 +618,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else() else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS) AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is " message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or " "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@ -707,14 +632,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if CUDA endif # if CUDA endif
endif() endif()
if (VLLM_GPU_LANG STREQUAL "HIP")
# Add QuickReduce kernels
list(APPEND VLLM_EXT_SRC
"csrc/custom_quickreduce.cu"
)
# if ROCM endif
endif()
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
define_gpu_extension_target( define_gpu_extension_target(
_C _C
@ -761,7 +678,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX # 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS) if (MARLIN_MOE_ARCHS)
# #
@ -862,7 +779,5 @@ endif()
# For CUDA we also build and ship some external projects. # For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA") if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake) include(cmake/external_projects/flashmla.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake) include(cmake/external_projects/vllm_flash_attn.cmake)
endif () endif ()

View File

@ -1,3 +1,3 @@
# Contributing to vLLM # Contributing to vLLM
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing). You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).

View File

@ -1,7 +1,7 @@
<p align="center"> <p align="center">
<picture> <picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png"> <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%> <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
</picture> </picture>
</p> </p>
@ -58,26 +58,28 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests - Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding - Speculative decoding
- Chunked prefill - Chunked prefill
**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
vLLM is flexible and easy to use with: vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models - Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor, pipeline, data and expert parallelism support for distributed inference - Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs - Streaming outputs
- OpenAI-compatible API server - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Prefix caching support - Prefix caching support
- Multi-LoRA support - Multi-LoRA support
vLLM seamlessly supports most popular open-source models on HuggingFace, including: vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama) - Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral) - Embedding Models (e.g. E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA) - Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@ -98,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing ## Contributing
We welcome and value any contributions and collaborations. We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved. Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
## Sponsors ## Sponsors
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
<!-- Note: Please sort them in alphabetical order. --> <!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/community/sponsors.md --> <!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
Cash Donations: Cash Donations:
- a16z - a16z
- Dropbox - Dropbox
@ -152,14 +154,12 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us ## Contact Us
<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions) - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) - coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu) - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
<!-- --8<-- [end:contact-us] -->
## Media Kit ## Media Kit
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) - If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).

View File

@ -8,6 +8,4 @@ Please report security issues privately using [the vulnerability submission form
--- ---
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

View File

@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
datasets supported on vLLM. Its a living document, updated as new features and datasets datasets supported on vLLM. Its a living document, updated as new features and datasets
become available. become available.
**Dataset Overview** ## Dataset Overview
<table style="width:100%; border-collapse: collapse;"> <table style="width:100%; border-collapse: collapse;">
<thead> <thead>
@ -64,12 +64,6 @@ become available.
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td> <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr> </tr>
<tr>
<td><strong>Custom</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>data.jsonl</code></td>
</tr>
</tbody> </tbody>
</table> </table>
@ -82,10 +76,7 @@ become available.
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
--- ---
<details> ## Example - Online Benchmark
<summary><b>🚀 Example - Online Benchmark</b></summary>
<br/>
First start serving your model First start serving your model
@ -133,40 +124,7 @@ P99 ITL (ms): 8.39
================================================== ==================================================
``` ```
**Custom Dataset** ### VisionArena Benchmark for Vision Language Models
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```
```bash
# run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
--backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \
--dataset-name custom \
--dataset-path <path-to-your-data-jsonl> \
--custom-skip-chat-template \
--num-prompts 80 \
--max-concurrency 1 \
--temperature=0.3 \
--top-p=0.75 \
--result-dir "./log/"
```
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
**VisionArena Benchmark for Vision Language Models**
```bash ```bash
# need a model with vision capability here # need a model with vision capability here
@ -184,13 +142,14 @@ python3 vllm/benchmarks/benchmark_serving.py \
--num-prompts 1000 --num-prompts 1000
``` ```
**InstructCoder Benchmark with Speculative Decoding** ### InstructCoder Benchmark with Speculative Decoding
``` bash ``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-config $'{"method": "ngram", --speculative-model "[ngram]" \
"num_speculative_tokens": 5, "prompt_lookup_max": 5, --ngram_prompt_lookup_min 2 \
"prompt_lookup_min": 2}' --ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5
``` ```
``` bash ``` bash
@ -201,7 +160,7 @@ python3 benchmarks/benchmark_serving.py \
--num-prompts 2048 --num-prompts 2048
``` ```
**Other HuggingFaceDataset Examples** ### Other HuggingFaceDataset Examples
```bash ```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
@ -245,17 +204,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
--seed 42 --seed 42
``` ```
**`philschmid/mt-bench`** ### Running With Sampling Parameters
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path philschmid/mt-bench \
--num-prompts 80
```
**Running With Sampling Parameters**
When using OpenAI-compatible backends such as `vllm`, optional sampling When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command: parameters can be specified. Example client command:
@ -273,27 +222,8 @@ python3 vllm/benchmarks/benchmark_serving.py \
--num-prompts 10 --num-prompts 10
``` ```
**Running With Ramp-Up Request Rate** ---
## Example - Offline Throughput Benchmark
The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.
Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
The following arguments can be used to control the ramp-up:
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
</details>
<details>
<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
<br/>
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
@ -311,7 +241,7 @@ Total num prompt tokens: 5014
Total num output tokens: 1500 Total num output tokens: 1500
``` ```
**VisionArena Benchmark for Vision Language Models** ### VisionArena Benchmark for Vision Language Models
``` bash ``` bash
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
@ -331,7 +261,7 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
**InstructCoder Benchmark with Speculative Decoding** ### InstructCoder Benchmark with Speculative Decoding
``` bash ``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
@ -344,9 +274,10 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--output-len=100 \ --output-len=100 \
--num-prompts=2048 \ --num-prompts=2048 \
--async-engine \ --async-engine \
--speculative-config $'{"method": "ngram", --speculative-model="[ngram]" \
"num_speculative_tokens": 5, "prompt_lookup_max": 5, --ngram_prompt_lookup_min=2 \
"prompt_lookup_min": 2}' --ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5
``` ```
``` ```
@ -355,7 +286,7 @@ Total num prompt tokens: 261136
Total num output tokens: 204800 Total num output tokens: 204800
``` ```
**Other HuggingFaceDataset Examples** ### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
@ -394,7 +325,7 @@ python3 benchmarks/benchmark_throughput.py \
--num-prompts 10 --num-prompts 10
``` ```
**Benchmark with LoRA Adapters** ### Benchmark with LoRA Adapters
``` bash ``` bash
# download dataset # download dataset
@ -410,196 +341,3 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--enable-lora \ --enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test --lora-path yard1/llama-2-7b-sql-lora-test
``` ```
</details>
<details>
<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
<br/>
Benchmark the performance of structured output generation (JSON, grammar, regex).
**Server Setup**
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
**JSON Schema Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
**Grammar-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
**Regex-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
**Choice-based Generation Benchmark**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
**XGrammar Benchmark Dataset**
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
</details>
<details>
<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
<br/>
Benchmark the performance of long document question-answering with prefix caching.
**Basic Long Document QA Test**
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
**Different Repeat Modes**
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
</details>
<details>
<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
<br/>
Benchmark the efficiency of automatic prefix caching.
**Fixed Prompt with Prefix Caching**
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
**ShareGPT Dataset with Prefix Caching**
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
</details>
<details>
<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
<br/>
Benchmark the performance of request prioritization in vLLM.
**Basic Prioritization Test**
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
**Multiple Sequences per Prompt**
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```
</details>

View File

@ -10,16 +10,11 @@
# 3. Set variables (ALL REQUIRED) # 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo # BASE: your directory for vllm repo
# MODEL: the model served by vllm # MODEL: the model served by vllm
# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights. # DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len # INPUT_LEN: request input len
# OUTPUT_LEN: request output len # OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate # MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 # MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
# 5. The final result will be saved in RESULT file. # 5. The final result will be saved in RESULT file.
@ -35,31 +30,31 @@
TAG=$(date +"%Y_%m_%d_%H_%M") TAG=$(date +"%Y_%m_%d_%H_%M")
BASE="" BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct" MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"
TP=1
DOWNLOAD_DIR="" DOWNLOAD_DIR=""
INPUT_LEN=4000 INPUT_LEN=4000
OUTPUT_LEN=16 OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0 MIN_CACHE_HIT_PCT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG" LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt" RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
echo "result file: $RESULT" echo "result file$ $RESULT"
echo "model: $MODEL" echo "model: $MODEL"
echo
rm -rf $LOG_FOLDER rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH
cd "$BASE/vllm" cd "$BASE/vllm"
# create sonnet-4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
pip install -q datasets pip install datasets
current_hash=$(git rev-parse HEAD) current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT" echo "hash:$current_hash" >> "$RESULT"
@ -69,88 +64,53 @@ best_throughput=0
best_max_num_seqs=0 best_max_num_seqs=0
best_num_batched_tokens=0 best_num_batched_tokens=0
best_goodput=0 best_goodput=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
update_best_profile() {
local profile_dir=$1
local profile_index=$2
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
run_benchmark() { run_benchmark() {
local max_num_seqs=$1 local max_num_seqs=$1
local max_num_batched_tokens=$2 local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
echo "vllm_log: $vllm_log" echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log rm -f $vllm_log
mkdir -p $profile_dir
pkill -f vllm
local profile_index=0
echo "starting server..." # start the server
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
result=$? --disable-log-requests \
if [[ "$result" -eq 1 ]]; then --port 8004 \
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" --gpu-memory-utilization 0.98 \
else --max-num-seqs $max_num_seqs \
echo "server started." --max-num-batched-tokens $max_num_batched_tokens \
fi --tensor-parallel-size 1 \
--enable-prefix-caching \
--load-format dummy \
--download-dir $DOWNLOAD_DIR \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo echo
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then
echo "Application started"
server_started=1
break
else
# echo "wait for 10 seconds..."
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
return 1
fi
echo "run benchmark test..." echo "run benchmark test..."
echo
meet_latency_requirement=0 meet_latency_requirement=0
# get a basic qps by using request-rate inf # get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@ -158,32 +118,30 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name sonnet \
--random-input-len $INPUT_LEN \ --dataset-path benchmarks/sonnet_4x.txt \
--random-output-len $OUTPUT_LEN \ --sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate inf \ --request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--port 8004 \ --port 8004 > "$bm_log"
--profile &> "$bm_log" through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1 meet_latency_requirement=1
request_rate=inf
fi fi
if (( ! meet_latency_requirement )); then if (( ! meet_latency_requirement )); then
# start from request-rate as int(throughput) + 1 # start from request-rate as int(through_put) + 1
request_rate=$((${throughput%.*} + 1)) request_rate=$((${through_put%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
profile_index=$((profile_index+1))
# clear prefix cache # clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5 sleep 5
@ -191,18 +149,19 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name sonnet \
--random-input-len $INPUT_LEN \ --dataset-path benchmarks/sonnet_4x.txt \
--random-output-len $OUTPUT_LEN \ --sonnet-input-len $INPUT_LEN \
--ignore-eos \ --sonnet-output-len $OUTPUT_LEN \
--ignore_eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $request_rate \ --request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \ --num-prompts 100 \
--random-prefix-len $prefix_len \ --sonnet-prefix-len $prefix_len \
--port 8004 &> "$bm_log" --port 8004 > "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@ -214,19 +173,13 @@ run_benchmark() {
fi fi
# write the results and update the best result. # write the results and update the best result.
if ((meet_latency_requirement)); then if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
best_throughput=$throughput best_throughput=$through_put
best_max_num_seqs=$max_num_seqs best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput best_goodput=$goodput
if [[ "$SYSTEM" == "TPU" ]]; then
update_best_profile "$profile_dir/plugins/profile" $profile_index
fi
if [[ "$SYSTEM" == "GPU" ]]; then
update_best_profile "$profile_dir" $profile_index
fi
fi fi
else else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@ -235,42 +188,25 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm pkill vllm
sleep 10 sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20) printf '=%.0s' $(seq 1 20)
return 0 return 0
} }
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
# first find out the max gpu-memory-utilization without HBM OOM. num_seqs_list="128 256"
gpu_memory_utilization=0.98 num_batched_tokens_list="512 1024 2048 4096"
find_gpu_memory_utilization=0 for num_seqs in $num_seqs_list; do
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do for num_batched_tokens in $num_batched_tokens_list; do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" run_benchmark $num_seqs $num_batched_tokens
result=$? exit 0
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done done
done done
echo "finish permutations" echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io import io
import json import json
@ -195,11 +194,6 @@ async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
async with aiohttp.ClientSession( async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session: ) as session:
@ -210,8 +204,6 @@ async def async_request_deepspeed_mii(
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0, "top_p": 1.0,
} }
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
output = RequestFuncOutput() output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len output.prompt_len = request_func_input.prompt_len
@ -223,7 +215,7 @@ async def async_request_deepspeed_mii(
st = time.perf_counter() st = time.perf_counter()
try: try:
async with session.post( async with session.post(
url=api_url, json=payload, headers=headers url=request_func_input.api_url, json=payload
) as response: ) as response:
if response.status == 200: if response.status == 200:
parsed_resp = await response.json() parsed_resp = await response.json()
@ -325,7 +317,7 @@ async def async_request_openai_completions(
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
generated_text += text or "" generated_text += text or ""
if usage := data.get("usage"): elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens") output.output_tokens = usage.get("completion_tokens")
if first_chunk_received: if first_chunk_received:
output.success = True output.success = True
@ -404,14 +396,8 @@ async def async_request_openai_chat_completions(
chunk_bytes = chunk_bytes.strip() chunk_bytes = chunk_bytes.strip()
if not chunk_bytes: if not chunk_bytes:
continue continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with a colon.
# These are not JSON data payload and should be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]": if chunk != "[DONE]":
timestamp = time.perf_counter() timestamp = time.perf_counter()
data = json.loads(chunk) data = json.loads(chunk)
@ -618,7 +604,6 @@ ASYNC_REQUEST_FUNCS = {
"tensorrt-llm": async_request_trt_llm, "tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions, "scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions, "sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions,
} }
OPENAI_COMPATIBLE_BACKENDS = [ OPENAI_COMPATIBLE_BACKENDS = [

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This module defines a framework for sampling benchmark requests from various This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample datasets. Each dataset subclass of BenchmarkDataset must implement sample
@ -10,6 +9,9 @@ generation. Supported dataset types include:
- BurstGPT - BurstGPT
- HuggingFace - HuggingFace
- VisionArena - VisionArena
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
SampleRequest instances, similar to the approach used in ShareGPT.
""" """
import base64 import base64
@ -33,7 +35,6 @@ from transformers import PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -256,7 +257,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
if isinstance(image, dict) and "bytes" in image: if isinstance(image, dict) and "bytes" in image:
image = Image.open(BytesIO(image["bytes"])) image = Image.open(BytesIO(image["bytes"]))
if isinstance(image, Image.Image): if isinstance(image, Image.Image):
image = convert_image_mode(image, "RGB") image = image.convert("RGB")
with io.BytesIO() as image_data: with io.BytesIO() as image_data:
image.save(image_data, format="JPEG") image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
@ -324,9 +325,6 @@ class RandomDataset(BenchmarkDataset):
input_low = int(real_input_len * (1 - range_ratio)) input_low = int(real_input_len * (1 - range_ratio))
input_high = int(real_input_len * (1 + range_ratio)) input_high = int(real_input_len * (1 + range_ratio))
output_low = int(output_len * (1 - range_ratio)) output_low = int(output_len * (1 - range_ratio))
# Ensure the lower bound for output length is at least 1 to prevent
# sampling 0 tokens, which can cause request failures.
output_low = max(output_low, 1)
output_high = int(output_len * (1 + range_ratio)) output_high = int(output_len * (1 + range_ratio))
# Add logging for debugging # Add logging for debugging
@ -352,12 +350,11 @@ class RandomDataset(BenchmarkDataset):
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length, # To avoid uncontrolled change of the prompt length,
# the encoded sequence is truncated before being decode again. # the encoded sequence is truncated before being decode again.
total_input_len = prefix_len + int(input_lens[i])
re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
:total_input_len : input_lens[i]
] ]
prompt = tokenizer.decode(re_encoded_sequence) prompt = tokenizer.decode(re_encoded_sequence)
total_input_len = len(re_encoded_sequence) total_input_len = prefix_len + int(input_lens[i])
requests.append( requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
@ -444,97 +441,6 @@ class ShareGPTDataset(BenchmarkDataset):
return samples return samples
# -----------------------------------------------------------------------------
# Custom Dataset Implementation
# -----------------------------------------------------------------------------
class CustomDataset(BenchmarkDataset):
"""
Implements the Custom dataset. Loads data from a JSONL file and generates
sample requests based on conversation turns. E.g.,
```
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
```
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
# self.data will be a list of dictionaries
# e.g., [{"prompt": "What is the capital of India?"}, ...]
# This will be the standardized format which load_data()
# has to convert into depending on the filetype of dataset_path.
# sample() will assume this standardized format of self.data
self.data = []
# Load the JSONL file
if self.dataset_path.endswith(".jsonl"):
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
# check if the JSONL file has a 'prompt' column
if "prompt" not in jsonl_data.columns:
raise ValueError("JSONL file must contain a 'prompt' column.")
# Convert each row to a dictionary and append to self.data
# This will convert the DataFrame to a list of dictionaries
# where each dictionary corresponds to a row in the DataFrame.
# This is the standardized format we want for self.data
for _, row in jsonl_data.iterrows():
self.data.append(row.to_dict())
else:
raise NotImplementedError(
"Only JSONL format is supported for CustomDataset."
)
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
**kwargs,
) -> list:
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item["prompt"]
# apply template
if not skip_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
)
)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Sonnet Dataset Implementation # Sonnet Dataset Implementation
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@ -704,7 +610,6 @@ class HuggingFaceDataset(BenchmarkDataset):
self, self,
dataset_path: str, dataset_path: str,
dataset_split: str, dataset_split: str,
no_stream: bool = False,
dataset_subset: Optional[str] = None, dataset_subset: Optional[str] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
@ -712,7 +617,6 @@ class HuggingFaceDataset(BenchmarkDataset):
self.dataset_split = dataset_split self.dataset_split = dataset_split
self.dataset_subset = dataset_subset self.dataset_subset = dataset_subset
self.load_stream = not no_stream
self.load_data() self.load_data()
def load_data(self) -> None: def load_data(self) -> None:
@ -721,7 +625,7 @@ class HuggingFaceDataset(BenchmarkDataset):
self.dataset_path, self.dataset_path,
name=self.dataset_subset, name=self.dataset_subset,
split=self.dataset_split, split=self.dataset_split,
streaming=self.load_stream, streaming=True,
) )
self.data = self.data.shuffle(seed=self.random_seed) self.data = self.data.shuffle(seed=self.random_seed)
@ -871,15 +775,7 @@ class InstructCoderDataset(HuggingFaceDataset):
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
prompt = f"{item['input']}\n\n{item['instruction']} Just output \ prompt = f"{item['instruction']}:\n{item['input']}"
the code, do not include any explanation."
# apply template
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests.""" """Benchmark the latency of processing a single batch of requests."""
import argparse import argparse
@ -7,12 +6,13 @@ import dataclasses
import json import json
import os import os
import time import time
from pathlib import Path
from typing import Any, Optional from typing import Any, Optional
import numpy as np import numpy as np
import torch
from tqdm import tqdm from tqdm import tqdm
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
@ -80,9 +80,17 @@ def main(args: argparse.Namespace):
def run_to_completion(profile_dir: Optional[str] = None): def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir: if profile_dir:
llm.start_profile() with torch.profiler.profile(
llm_generate() activities=[
llm.stop_profile() torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)
),
) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
else: else:
start_time = time.perf_counter() start_time = time.perf_counter()
llm_generate() llm_generate()
@ -95,7 +103,11 @@ def main(args: argparse.Namespace):
run_to_completion(profile_dir=None) run_to_completion(profile_dir=None)
if args.profile: if args.profile:
profile_dir = envs.VLLM_TORCH_PROFILER_DIR profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = (
Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
)
print(f"Profiling (results will be saved to '{profile_dir}')...") print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir) run_to_completion(profile_dir=profile_dir)
return return
@ -123,7 +135,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results) save_to_pytorch_benchmark_format(args, results)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of " description="Benchmark the latency of processing a single batch of "
"requests till completion." "requests till completion."
@ -152,6 +164,15 @@ def create_argument_parser():
action="store_true", action="store_true",
help="profile the generation process of a single batch", help="profile the generation process of a single batch",
) )
parser.add_argument(
"--profile-result-dir",
type=str,
default=None,
help=(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."
),
)
parser.add_argument( parser.add_argument(
"--output-json", "--output-json",
type=str, type=str,
@ -168,19 +189,5 @@ def create_argument_parser():
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
# V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main(args) main(args)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Offline benchmark to test the long document QA throughput. Offline benchmark to test the long document QA throughput.
@ -142,7 +141,7 @@ def main(args):
) )
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the performance with or " description="Benchmark the performance with or "
"without automatic prefix caching." "without automatic prefix caching."
@ -192,11 +191,5 @@ def create_argument_parser():
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Benchmark the efficiency of prefix caching. Benchmark the efficiency of prefix caching.
@ -218,7 +217,7 @@ def main(args):
) )
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the performance with or without " description="Benchmark the performance with or without "
"automatic prefix caching." "automatic prefix caching."
@ -268,11 +267,5 @@ def create_argument_parser():
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization.""" """Benchmark offline prioritization."""
import argparse import argparse
@ -161,7 +160,7 @@ def main(args: argparse.Namespace):
json.dump(results, f, indent=4) json.dump(results, f, indent=4)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument( parser.add_argument(
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@ -204,12 +203,6 @@ def create_argument_parser():
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput. r"""Benchmark online serving throughput.
On the server side, run one of the following commands: On the server side, run one of the following commands:
@ -33,7 +32,7 @@ import warnings
from collections.abc import AsyncGenerator, Iterable from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from typing import Any, Literal, Optional from typing import Any, Optional
import numpy as np import numpy as np
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
@ -61,7 +60,6 @@ from benchmark_dataset import (
ASRDataset, ASRDataset,
BurstGPTDataset, BurstGPTDataset,
ConversationDataset, ConversationDataset,
CustomDataset,
HuggingFaceDataset, HuggingFaceDataset,
InstructCoderDataset, InstructCoderDataset,
MTBenchDataset, MTBenchDataset,
@ -107,42 +105,14 @@ class BenchmarkMetrics:
percentiles_e2el_ms: list[tuple[float, float]] percentiles_e2el_ms: list[tuple[float, float]]
def _get_current_request_rate(
ramp_up_strategy: Optional[Literal["linear", "exponential"]],
ramp_up_start_rps: Optional[int],
ramp_up_end_rps: Optional[int],
request_index: int,
total_requests: int,
request_rate: float,
) -> float:
if (
ramp_up_strategy
and ramp_up_start_rps is not None
and ramp_up_end_rps is not None
):
progress = request_index / max(total_requests - 1, 1)
if ramp_up_strategy == "linear":
increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
return ramp_up_start_rps + increase
elif ramp_up_strategy == "exponential":
ratio = ramp_up_end_rps / ramp_up_start_rps
return ramp_up_start_rps * (ratio**progress)
else:
raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
return request_rate
async def get_request( async def get_request(
input_requests: list[SampleRequest], input_requests: list[SampleRequest],
request_rate: float, request_rate: float,
burstiness: float = 1.0, burstiness: float = 1.0,
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, ) -> AsyncGenerator[SampleRequest, None]:
ramp_up_start_rps: Optional[int] = None,
ramp_up_end_rps: Optional[int] = None,
) -> AsyncGenerator[tuple[SampleRequest, float], None]:
""" """
Asynchronously generates requests at a specified rate Asynchronously generates requests at a specified rate
with OPTIONAL burstiness and OPTIONAL ramp-up strategy. with OPTIONAL burstiness.
Args: Args:
input_requests: input_requests:
@ -157,44 +127,22 @@ async def get_request(
A lower burstiness value (0 < burstiness < 1) results A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests. (burstiness > 1) results in a more uniform arrival of requests.
ramp_up_strategy (optional):
The ramp-up strategy. Can be "linear" or "exponential".
If None, uses constant request rate (specified by request_rate).
ramp_up_start_rps (optional):
The starting request rate for ramp-up.
ramp_up_end_rps (optional):
The ending request rate for ramp-up.
""" """
input_requests: Iterable[SampleRequest] = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
assert burstiness > 0, ( assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}." f"A positive burstiness factor is expected, but given {burstiness}."
) )
# Convert to list to get length for ramp-up calculations theta = 1.0 / (request_rate * burstiness)
if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
input_requests = list(input_requests)
total_requests = len(input_requests)
request_index = 0
for request in input_requests: for request in input_requests:
current_request_rate = _get_current_request_rate( yield request
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
request_index,
total_requests,
request_rate,
)
yield request, current_request_rate if request_rate == float("inf"):
request_index += 1
if current_request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait. # If the request rate is infinity, then we don't need to wait.
continue continue
theta = 1.0 / (current_request_rate * burstiness)
# Sample the request interval from the gamma distribution. # Sample the request interval from the gamma distribution.
# If burstiness is 1, it follows exponential distribution. # If burstiness is 1, it follows exponential distribution.
interval = np.random.gamma(shape=burstiness, scale=theta) interval = np.random.gamma(shape=burstiness, scale=theta)
@ -327,7 +275,7 @@ async def benchmark(
model_id: str, model_id: str,
model_name: str, model_name: str,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_requests: list[SampleRequest], requests: list[SampleRequest],
logprobs: Optional[int], logprobs: Optional[int],
request_rate: float, request_rate: float,
burstiness: float, burstiness: float,
@ -340,9 +288,6 @@ async def benchmark(
max_concurrency: Optional[int], max_concurrency: Optional[int],
lora_modules: Optional[Iterable[str]], lora_modules: Optional[Iterable[str]],
extra_body: Optional[dict], extra_body: Optional[dict],
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
ramp_up_start_rps: Optional[int] = None,
ramp_up_end_rps: Optional[int] = None,
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]
@ -350,12 +295,14 @@ async def benchmark(
raise ValueError(f"Unknown backend: {backend}") raise ValueError(f"Unknown backend: {backend}")
print("Starting initial single prompt test run...") print("Starting initial single prompt test run...")
last_idx = len(requests) - 1
test_prompt, test_prompt_len, test_output_len, test_mm_content = ( test_prompt, test_prompt_len, test_output_len, test_mm_content = (
input_requests[0].prompt, requests[last_idx].prompt,
input_requests[0].prompt_len, requests[last_idx].prompt_len,
input_requests[0].expected_output_len, requests[last_idx].expected_output_len,
input_requests[0].multi_modal_data, requests[last_idx].multi_modal_data,
) )
input_requests = requests[:last_idx]
assert test_mm_content is None or isinstance(test_mm_content, dict) assert test_mm_content is None or isinstance(test_mm_content, dict)
test_input = RequestFuncInput( test_input = RequestFuncInput(
@ -406,15 +353,7 @@ async def benchmark(
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution" distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
if ramp_up_strategy is not None: print(f"Traffic request rate: {request_rate}")
print(
f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
"the duration of the benchmark."
)
else:
print(f"Traffic request rate: {request_rate} RPS.")
print(f"Burstiness factor: {burstiness} ({distribution})") print(f"Burstiness factor: {burstiness} ({distribution})")
print(f"Maximum request concurrency: {max_concurrency}") print(f"Maximum request concurrency: {max_concurrency}")
@ -434,34 +373,7 @@ async def benchmark(
benchmark_start_time = time.perf_counter() benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = [] tasks: list[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
rps_change_events = []
last_int_rps = -1
if ramp_up_strategy is not None and ramp_up_start_rps is not None:
last_int_rps = ramp_up_start_rps
rps_change_events.append(
{
"rps": last_int_rps,
"timestamp": datetime.now().isoformat(),
}
)
async for request, current_request_rate in get_request(
input_requests,
request_rate,
burstiness,
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
):
if ramp_up_strategy is not None:
current_int_rps = int(current_request_rate)
if current_int_rps > last_int_rps:
timestamp = datetime.now().isoformat()
for rps_val in range(last_int_rps + 1, current_int_rps + 1):
rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
last_int_rps = current_int_rps
prompt, prompt_len, output_len, mm_content = ( prompt, prompt_len, output_len, mm_content = (
request.prompt, request.prompt,
request.prompt_len, request.prompt_len,
@ -485,8 +397,11 @@ async def benchmark(
ignore_eos=ignore_eos, ignore_eos=ignore_eos,
extra_body=extra_body, extra_body=extra_body,
) )
task = limited_request_func(request_func_input=request_func_input, pbar=pbar) tasks.append(
tasks.append(asyncio.create_task(task)) asyncio.create_task(
limited_request_func(request_func_input=request_func_input, pbar=pbar)
)
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile: if profile:
@ -551,7 +466,7 @@ async def benchmark(
"total_input_tokens": metrics.total_input, "total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output, "total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput, "request_throughput": metrics.request_throughput,
"request_goodput": metrics.request_goodput if goodput_config_dict else None, "request_goodput:": metrics.request_goodput if goodput_config_dict else None,
"output_throughput": metrics.output_throughput, "output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput, "total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs], "input_lens": [output.prompt_len for output in outputs],
@ -562,9 +477,6 @@ async def benchmark(
"errors": [output.error for output in outputs], "errors": [output.error for output in outputs],
} }
if rps_change_events:
result["rps_change_events"] = rps_change_events
def process_one_metric( def process_one_metric(
# E.g., "ttft" # E.g., "ttft"
metric_attribute_name: str, metric_attribute_name: str,
@ -698,26 +610,6 @@ def main(args: argparse.Namespace):
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
tokenizer_mode = args.tokenizer_mode tokenizer_mode = args.tokenizer_mode
# Validate ramp-up arguments
if args.ramp_up_strategy is not None:
if args.request_rate != float("inf"):
raise ValueError(
"When using ramp-up, do not specify --request-rate. "
"The request rate will be controlled by ramp-up parameters. "
"Please remove the --request-rate argument."
)
if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
raise ValueError(
"When using --ramp-up-strategy, both --ramp-up-start-rps and "
"--ramp-up-end-rps must be specified"
)
if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
raise ValueError("Ramp-up start and end RPS must be non-negative")
if args.ramp_up_start_rps > args.ramp_up_end_rps:
raise ValueError("Ramp-up start RPS must be less than end RPS")
if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
if args.base_url is not None: if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}" api_url = f"{args.base_url}{args.endpoint}"
base_url = f"{args.base_url}" base_url = f"{args.base_url}"
@ -725,6 +617,9 @@ def main(args: argparse.Namespace):
api_url = f"http://{args.host}:{args.port}{args.endpoint}" api_url = f"http://{args.host}:{args.port}{args.endpoint}"
base_url = f"http://{args.host}:{args.port}" base_url = f"http://{args.host}:{args.port}"
# Create one more request (for a test request)
total_prompts = args.num_prompts + 1
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
tokenizer_id, tokenizer_id,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
@ -737,21 +632,12 @@ def main(args: argparse.Namespace):
"'--dataset-path' if required." "'--dataset-path' if required."
) )
if args.dataset_name == "custom": if args.dataset_name == "sonnet":
dataset = CustomDataset(dataset_path=args.dataset_path)
input_requests = dataset.sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.custom_output_len,
skip_chat_template=args.custom_skip_chat_template,
)
elif args.dataset_name == "sonnet":
dataset = SonnetDataset(dataset_path=args.dataset_path) dataset = SonnetDataset(dataset_path=args.dataset_path)
# For the "sonnet" dataset, formatting depends on the backend. # For the "sonnet" dataset, formatting depends on the backend.
if args.backend == "openai-chat": if args.backend == "openai-chat":
input_requests = dataset.sample( input_requests = dataset.sample(
num_requests=args.num_prompts, num_requests=total_prompts,
input_len=args.sonnet_input_len, input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len, output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len, prefix_len=args.sonnet_prefix_len,
@ -763,7 +649,7 @@ def main(args: argparse.Namespace):
"Tokenizer/model must have chat template for sonnet dataset." "Tokenizer/model must have chat template for sonnet dataset."
) )
input_requests = dataset.sample( input_requests = dataset.sample(
num_requests=args.num_prompts, num_requests=total_prompts,
input_len=args.sonnet_input_len, input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len, output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len, prefix_len=args.sonnet_prefix_len,
@ -825,9 +711,8 @@ def main(args: argparse.Namespace):
dataset_subset=args.hf_subset, dataset_subset=args.hf_subset,
dataset_split=args.hf_split, dataset_split=args.hf_split,
random_seed=args.seed, random_seed=args.seed,
no_stream=args.no_stream,
).sample( ).sample(
num_requests=args.num_prompts, num_requests=total_prompts,
tokenizer=tokenizer, tokenizer=tokenizer,
output_len=args.hf_output_len, output_len=args.hf_output_len,
) )
@ -839,15 +724,15 @@ def main(args: argparse.Namespace):
random_seed=args.seed, dataset_path=args.dataset_path random_seed=args.seed, dataset_path=args.dataset_path
).sample( ).sample(
tokenizer=tokenizer, tokenizer=tokenizer,
num_requests=args.num_prompts, num_requests=total_prompts,
output_len=args.sharegpt_output_len, output_len=args.sharegpt_output_len,
), ),
"burstgpt": lambda: BurstGPTDataset( "burstgpt": lambda: BurstGPTDataset(
random_seed=args.seed, dataset_path=args.dataset_path random_seed=args.seed, dataset_path=args.dataset_path
).sample(tokenizer=tokenizer, num_requests=args.num_prompts), ).sample(tokenizer=tokenizer, num_requests=total_prompts),
"random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
tokenizer=tokenizer, tokenizer=tokenizer,
num_requests=args.num_prompts, num_requests=total_prompts,
prefix_len=args.random_prefix_len, prefix_len=args.random_prefix_len,
input_len=args.random_input_len, input_len=args.random_input_len,
output_len=args.random_output_len, output_len=args.random_output_len,
@ -882,10 +767,6 @@ def main(args: argparse.Namespace):
if "temperature" not in sampling_params: if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding. sampling_params["temperature"] = 0.0 # Default to greedy decoding.
if args.backend == "llama.cpp":
# Disable prompt caching in llama.cpp backend
sampling_params["cache_prompt"] = False
# Avoid GC processing "static" data - reduce pause times. # Avoid GC processing "static" data - reduce pause times.
gc.collect() gc.collect()
gc.freeze() gc.freeze()
@ -898,7 +779,7 @@ def main(args: argparse.Namespace):
model_id=model_id, model_id=model_id,
model_name=model_name, model_name=model_name,
tokenizer=tokenizer, tokenizer=tokenizer,
input_requests=input_requests, requests=input_requests,
logprobs=args.logprobs, logprobs=args.logprobs,
request_rate=args.request_rate, request_rate=args.request_rate,
burstiness=args.burstiness, burstiness=args.burstiness,
@ -911,9 +792,6 @@ def main(args: argparse.Namespace):
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules, lora_modules=args.lora_modules,
extra_body=sampling_params, extra_body=sampling_params,
ramp_up_strategy=args.ramp_up_strategy,
ramp_up_start_rps=args.ramp_up_start_rps,
ramp_up_end_rps=args.ramp_up_end_rps,
) )
) )
@ -946,11 +824,6 @@ def main(args: argparse.Namespace):
result_json["burstiness"] = args.burstiness result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency result_json["max_concurrency"] = args.max_concurrency
if args.ramp_up_strategy is not None:
result_json["ramp_up_strategy"] = args.ramp_up_strategy
result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
# Merge with benchmark result # Merge with benchmark result
result_json = {**result_json, **benchmark_result} result_json = {**result_json, **benchmark_result}
@ -966,8 +839,6 @@ def main(args: argparse.Namespace):
]: ]:
if field in result_json: if field in result_json:
del result_json[field] del result_json[field]
if field in benchmark_result:
del benchmark_result[field]
# Save to file # Save to file
base_model_id = model_id.split("/")[-1] base_model_id = model_id.split("/")[-1]
@ -976,14 +847,10 @@ def main(args: argparse.Namespace):
if args.max_concurrency is not None if args.max_concurrency is not None
else "" else ""
) )
if args.ramp_up_strategy is not None: file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
else:
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
if args.result_filename: if args.result_filename:
file_name = args.result_filename file_name = args.result_filename
if args.result_dir: if args.result_dir:
os.makedirs(args.result_dir, exist_ok=True)
file_name = os.path.join(args.result_dir, file_name) file_name = os.path.join(args.result_dir, file_name)
with open( with open(
file_name, mode="a+" if args.append_result else "w", encoding="utf-8" file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
@ -995,7 +862,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, result_json, file_name) save_to_pytorch_benchmark_format(args, result_json, file_name)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput." description="Benchmark the online serving throughput."
) )
@ -1024,7 +891,7 @@ def create_argument_parser():
"--dataset-name", "--dataset-name",
type=str, type=str,
default="sharegpt", default="sharegpt",
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
) )
parser.add_argument( parser.add_argument(
@ -1034,11 +901,6 @@ def create_argument_parser():
help="Path to the sharegpt/sonnet dataset. " help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.", "Or the huggingface dataset ID if using HF dataset.",
) )
parser.add_argument(
"--no-stream",
action="store_true",
help="Do not load the dataset in streaming mode.",
)
parser.add_argument( parser.add_argument(
"--max-concurrency", "--max-concurrency",
type=int, type=int,
@ -1199,19 +1061,6 @@ def create_argument_parser():
) )
# group for dataset specific arguments # group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
custom_group.add_argument(
"--custom-output-len",
type=int,
default=256,
help="Number of output tokens per request, used only for custom dataset.",
)
custom_group.add_argument(
"--custom-skip-chat-template",
action="store_true",
help="Skip applying chat template to prompt, used only for custom dataset.",
)
sonnet_group = parser.add_argument_group("sonnet dataset options") sonnet_group = parser.add_argument_group("sonnet dataset options")
sonnet_group.add_argument( sonnet_group.add_argument(
"--sonnet-input-len", "--sonnet-input-len",
@ -1350,35 +1199,6 @@ def create_argument_parser():
"script chooses a LoRA module at random.", "script chooses a LoRA module at random.",
) )
parser.add_argument(
"--ramp-up-strategy",
type=str,
default=None,
choices=["linear", "exponential"],
help="The ramp-up strategy. This would be used to "
"ramp up the request rate from initial RPS to final "
"RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
"over the duration of the benchmark.",
)
parser.add_argument(
"--ramp-up-start-rps",
type=int,
default=None,
help="The starting request rate for ramp-up (RPS). "
"Needs to be specified when --ramp-up-strategy is used.",
)
parser.add_argument(
"--ramp-up-end-rps",
type=int,
default=None,
help="The ending request rate for ramp-up (RPS). "
"Needs to be specified when --ramp-up-strategy is used.",
)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput with structured outputs. r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands: On the server side, run one of the following commands:
@ -12,6 +11,7 @@ On the client side, run:
--model <your_model> \ --model <your_model> \
--dataset json \ --dataset json \
--structured-output-ratio 1.0 \ --structured-output-ratio 1.0 \
--structured-output-backend auto \
--request-rate 10 \ --request-rate 10 \
--num-prompts 1000 --num-prompts 1000
@ -672,7 +672,7 @@ async def benchmark(
def evaluate(ret, args): def evaluate(ret, args):
def _eval_correctness_json(expected, actual): def _eval_correctness_json(expected, actual):
# extract json string from string using regex # extract json string from string using regex
import regex as re import re
actual = actual.replace("\n", "").replace(" ", "").strip() actual = actual.replace("\n", "").replace(" ", "").strip()
try: try:
@ -687,7 +687,7 @@ def evaluate(ret, args):
return actual in args.choice return actual in args.choice
def _eval_correctness_regex(expected, actual): def _eval_correctness_regex(expected, actual):
import regex as re import re
return re.match(args.regex, actual) is not None return re.match(args.regex, actual) is not None
@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
json.dump(results, outfile, indent=4) json.dump(results, outfile, indent=4)
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput." description="Benchmark the online serving throughput."
) )
@ -1034,10 +1034,5 @@ def create_argument_parser():
help="Ratio of Structured Outputs requests", help="Ratio of Structured Outputs requests",
) )
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput.""" """Benchmark offline inference throughput."""
import argparse import argparse
@ -97,7 +96,7 @@ def run_vllm(
assert lora_requests is None, "BeamSearch API does not support LoRA" assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests] prompts = [request.prompt for request in requests]
# output_len should be the same for all requests. # output_len should be the same for all requests.
output_len = requests[0].expected_output_len output_len = requests[0][2]
for request in requests: for request in requests:
assert request.expected_output_len == output_len assert request.expected_output_len == output_len
start = time.perf_counter() start = time.perf_counter()
@ -356,7 +355,6 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "burstgpt": elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf": elif args.dataset_name == "hf":
common_kwargs["no_stream"] = args.no_stream
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset dataset_cls = VisionArenaDataset
common_kwargs["dataset_subset"] = None common_kwargs["dataset_subset"] = None
@ -596,7 +594,7 @@ def validate_args(args):
) )
def create_argument_parser(): if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument( parser.add_argument(
"--backend", "--backend",
@ -611,11 +609,6 @@ def create_argument_parser():
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
default="sharegpt", default="sharegpt",
) )
parser.add_argument(
"--no-stream",
action="store_true",
help="Do not load the dataset in streaming mode.",
)
parser.add_argument( parser.add_argument(
"--dataset", "--dataset",
type=str, type=str,
@ -723,12 +716,6 @@ def create_argument_parser():
) )
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
@ -66,9 +65,4 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None: def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f: with open(filename, "w") as f:
json.dump( json.dump(records, f, cls=InfEncoder)
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils # Cutlass bench utils
from collections.abc import Iterable from collections.abc import Iterable

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy
@ -19,7 +18,7 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul, w8a8_block_fp8_matmul,
) )
from vllm.utils import FlexibleArgumentParser, cdiv from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@ -117,9 +116,14 @@ def bench_fp8(
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32) def ceil_div(x: int, y: int) -> int:
return (x + y - 1) // y
block_scale_a = torch.rand(
(m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
)
block_scale_b = torch.rand( block_scale_b = torch.rand(
cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32 ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
) )
block_scale_a_M_major = block_scale_a.t().contiguous().t() block_scale_a_M_major = block_scale_a.t().contiguous().t()
block_scale_b_K_major = block_scale_b.t().contiguous().t() block_scale_b_K_major = block_scale_b.t().contiguous().t()

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format # Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM) # ([K, N], TP_SPLIT_DIM)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import itertools import itertools

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle as pkl import pickle as pkl
import time import time

View File

@ -1,159 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"fp8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"fp8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"fp8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"fp8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"fp8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"fp8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"fp8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"fp8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
else:
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
return b_fp8.t(), scale_b_fp8
def build_fp8_runner(cfg, a, b, dtype, device):
b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)
scale_a_const = (
torch.ones(1, device=device, dtype=torch.float32)
if cfg["a"] == "tensor"
else None
)
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
else:
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
def run():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
if cfg["a"] == "tensor":
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs FP8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_fp8_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_fp8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@ -1,169 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"int8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"int8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"int8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"int8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"int8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"int8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"int8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"int8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
def _quant_weight(b, w_type, device):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
assert scale_b_int8.numel() == 1
else: # channel
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
assert scale_b_int8.numel() == b.shape[0]
return b_int8.t(), scale_b_int8
def build_int8_runner(cfg, a, b, dtype, device):
# quant before running the kernel
b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)
scale_a_const = None
if cfg["a"] == "tensor":
scale_a_const = torch.ones(1, device=device, dtype=torch.float32)
# no quant, create activation ahead
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
else: # token
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
def run_quant():
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
# dynamic quant, create activation inside
if cfg["a"] == "tensor":
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
else: # token
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=[k for k in _enabled],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs INT8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_int8_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
KN_model_names = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
KN_model_names.append(KN)
return KN_model_names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
)
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_int8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@ -1,141 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.triton_utils import triton
if not current_platform.has_device_capability(100):
raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)")
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
# Compute global scale for weight
b_amax = torch.abs(b).max().to(torch.float32)
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
return b_fp4, scale_b_fp4, b_global_scale
def build_nvfp4_runner(cfg, a, b, dtype, device):
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
# Compute global scale for activation
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
a_amax = torch.abs(a).max().to(torch.float32)
a_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax
# Alpha for the GEMM operation
alpha = 1.0 / (a_global_scale * b_global_scale)
if cfg["no_a_quant"]:
# Pre-quantize activation
a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale)
def run():
return ops.cutlass_scaled_fp4_mm(
a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype
)
return run
# Quantize activation on-the-fly
def run():
a_fp4, scale_a_fp4 = ops.scaled_fp4_quant(a, a_global_scale)
return ops.cutlass_scaled_fp4_mm(
a_fp4, b_fp4, scale_a_fp4, scale_b_fp4, alpha, dtype
)
return run
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs NVFP4 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_nvfp4_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
out.append(KN)
return out
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@ -1,98 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
import torch
from vllm import _custom_ops as ops
from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
# TODO(luka): use standalone_compile utility
def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
def inner(*args):
torch._dynamo.mark_dynamic(args[arg_index], dim_index)
return fn(*args)
return inner
torch._dynamo.config.recompile_limit = 8888
compilation_config = CompilationConfig(custom_ops=["none"])
with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
torch_per_token_quant_fp8 = torch.compile(
QuantFP8(False, GroupShape.PER_TOKEN),
fullgraph=True,
dynamic=False, # recompile for different shapes
)
# First dim is explicitly dynamic to simulate vLLM usage
torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
def cuda_per_token_quant_fp8(
input: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
return ops.scaled_fp8_quant(input)
def calculate_diff(batch_size: int, seq_len: int):
"""Calculate difference between Triton and CUDA implementations."""
device = torch.device("cuda")
x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
torch_out, torch_scale = torch_per_token_quant_fp8(x)
cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
if torch.allclose(
cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
configs = list(itertools.product(batch_size_range, seq_len_range))
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size", "seq_len"],
x_vals=configs,
line_arg="provider",
line_vals=["torch", "cuda"],
line_names=["Torch", "CUDA"],
styles=[("blue", "-"), ("green", "-")],
ylabel="us",
plot_name="per-token-dynamic-quant-fp8-performance",
args={},
)
)
def benchmark_quantization(batch_size, seq_len, provider):
dtype = torch.float16
device = torch.device("cuda")
x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch":
fn = lambda: torch_per_token_quant_fp8(x.clone())
elif provider == "cuda":
fn = lambda: cuda_per_token_quant_fp8(x.clone())
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
if __name__ == "__main__":
calculate_diff(batch_size=4, seq_len=4096)
benchmark_quantization.run(print_data=True)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import sys import sys

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation. # Copyright (c) Microsoft Corporation.
# Licensed under the MIT License. # Licensed under the MIT License.

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@ -91,7 +90,7 @@ def bench_run(
score = torch.randn((m, num_experts), device=device, dtype=dtype) score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
quant_blocksize = 16 quant_blocksize = 16
w1_blockscale = torch.empty( w1_blockscale = torch.empty(

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark
@ -7,8 +6,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import (
cutlass_moe_fp8,
fused_experts, fused_experts,
fused_topk, fused_topk,
) )
@ -70,9 +69,18 @@ def bench_run(
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
for expert in range(num_experts): for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype) score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
@ -80,11 +88,6 @@ def bench_run(
a, score, topk, renormalize=False a, score, topk, renormalize=False
) )
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
def run_triton_moe( def run_triton_moe(
a: torch.Tensor, a: torch.Tensor,
w1: torch.Tensor, w1: torch.Tensor,
@ -116,13 +119,12 @@ def bench_run(
w2: torch.Tensor, w2: torch.Tensor,
w1_scale: torch.Tensor, w1_scale: torch.Tensor,
w2_scale: torch.Tensor, w2_scale: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
topk_weights: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
per_act_token: bool, ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
num_repeats: int, num_repeats: int,
): ):
for _ in range(num_repeats): for _ in range(num_repeats):
@ -130,16 +132,15 @@ def bench_run(
a, a,
w1, w1,
w2, w2,
topk_weights,
topk_ids,
w1_scale, w1_scale,
w2_scale, w2_scale,
topk_weights,
topk_ids,
ab_strides1, ab_strides1,
ab_strides2,
c_strides1, c_strides1,
ab_strides2,
c_strides2, c_strides2,
per_act_token, a1_scale=a_scale,
a1_scale=None,
) )
def run_cutlass_from_graph( def run_cutlass_from_graph(
@ -149,12 +150,12 @@ def bench_run(
w2_q: torch.Tensor, w2_q: torch.Tensor,
w1_scale: torch.Tensor, w1_scale: torch.Tensor,
w2_scale: torch.Tensor, w2_scale: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
topk_weights: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
): ):
with set_current_vllm_config( with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@ -163,16 +164,15 @@ def bench_run(
a, a,
w1_q, w1_q,
w2_q, w2_q,
topk_weights,
topk_ids,
w1_scale, w1_scale,
w2_scale, w2_scale,
topk_weights,
topk_ids,
ab_strides1, ab_strides1,
ab_strides2,
c_strides1, c_strides1,
ab_strides2,
c_strides2, c_strides2,
per_act_token, a1_scale=a_scale,
a1_scale=None,
) )
def run_triton_from_graph( def run_triton_from_graph(
@ -215,12 +215,12 @@ def bench_run(
w2_q, w2_q,
w1_scale, w1_scale,
w2_scale, w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
topk_weights, topk_weights,
topk_ids, topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
) )
torch.cuda.synchronize() torch.cuda.synchronize()
@ -229,8 +229,8 @@ def bench_run(
with torch.cuda.graph(triton_graph, stream=triton_stream): with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph( run_triton_from_graph(
a, a,
w1_q, w1_q_notransp,
w2_q, w2_q_notransp,
topk_weights, topk_weights,
topk_ids, topk_ids,
w1_scale, w1_scale,
@ -249,16 +249,17 @@ def bench_run(
"w2": w2, "w2": w2,
"score": score, "score": score,
"topk": topk, "topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params # Cutlass params
"a_scale": a_scale, "a_scale": a_scale,
"w1_q": w1_q, "w1_q": w1_q,
"w2_q": w2_q, "w2_q": w2_q,
"w1_scale": w1_scale, "w1_scale": w1_scale,
"w2_scale": w2_scale, "w2_scale": w2_scale,
"per_act_token": per_act_token,
"ab_strides1": ab_strides1, "ab_strides1": ab_strides1,
"ab_strides2": ab_strides2,
"c_strides1": c_strides1, "c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2, "c_strides2": c_strides2,
# cuda graph params # cuda graph params
"cutlass_graph": cutlass_graph, "cutlass_graph": cutlass_graph,
@ -277,8 +278,8 @@ def bench_run(
# Warmup # Warmup
run_triton_moe( run_triton_moe(
a, a,
w1_q, w1_q_notransp,
w2_q, w2_q_notransp,
topk_weights, topk_weights,
topk_ids, topk_ids,
w1_scale, w1_scale,
@ -289,7 +290,7 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@ -318,19 +319,18 @@ def bench_run(
w2_q, w2_q,
w1_scale, w1_scale,
w2_scale, w2_scale,
ab_strides1,
ab_strides2,
c_strides1,
c_strides2,
topk_weights, topk_weights,
topk_ids, topk_ids,
per_act_token, ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
num_warmup, num_warmup,
) )
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501 stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import copy import copy
@ -234,10 +233,8 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
fn = lambda: ops.gptq_marlin_gemm( fn = lambda: ops.gptq_marlin_gemm(
a=bt.a, a=bt.a,
c=None,
b_q_weight=w_q, b_q_weight=w_q,
b_scales=w_s, b_scales=w_s,
global_scale=None,
b_zeros=w_zp, b_zeros=w_zp,
g_idx=g_idx, g_idx=g_idx,
perm=sort_indices, perm=sort_indices,

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark
@ -22,16 +21,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_GROUP_SIZES,
query_marlin_supported_quant_types, query_marlin_supported_quant_types,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
FP4_MARLIN_SUPPORTED_GROUP_SIZES,
rand_marlin_weight_fp4_like,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
marlin_quant_fp8_torch,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, MarlinWorkspace,
awq_marlin_quantize,
marlin_quantize, marlin_quantize,
) )
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
@ -43,7 +34,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights, quantize_weights,
sort_weights, sort_weights,
) )
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@ -65,144 +56,80 @@ def bench_run(
size_n: int, size_n: int,
): ):
label = "Quant Matmul" label = "Quant Matmul"
sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format( sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
) )
print(f"Testing: {sub_label}") print(f"Testing: {sub_label}")
a = torch.randn(size_m, size_k).to(torch.half).cuda() a = torch.randn(size_m, size_k).to(torch.half).cuda()
b = torch.rand(size_k, size_n).to(torch.half).cuda() b = torch.rand(size_k, size_n).to(torch.half).cuda()
has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
if act_order and (group_size == -1 or group_size == size_k or has_zp):
return
if size_k % group_size != 0:
return
marlin_24_supported = ( a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda()
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
and not act_order
and is_k_full
)
def gen_marlin_params():
# Marlin quant
marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
if quant_type == scalar_types.float4_e2m1f:
if group_size != 16 or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
b.T, group_size
)
elif quant_type == scalar_types.float8_e4m3fn:
if group_size not in [-1, 128] or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
elif group_size == 16:
return
elif has_zp:
marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
b, quant_type, group_size
)
else:
marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
marlin_quantize(b, quant_type, group_size, act_order)
)
return (
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
)
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params():
q_w_gptq = None
repack_sort_indices = None
if repack_supported:
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
return q_w_gptq, repack_sort_indices
def gen_allspark_params():
qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
CUBLAS_M_THRESHOLD
) = None
nonlocal allspark_supported
if allspark_supported:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
allspark_supported = allspark_supported and supported_arch
if supported_arch:
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
return (
qw_reorder,
s_reorder,
zp_reorder,
sm_count,
sm_version,
CUBLAS_M_THRESHOLD,
)
# Marlin quant
( (
marlin_w_ref, marlin_w_ref,
marlin_q_w, marlin_q_w,
marlin_s, marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx, marlin_g_idx,
marlin_sort_indices, marlin_sort_indices,
) = gen_marlin_params() marlin_rand_perm,
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = ( ) = marlin_quantize(b, quant_type, group_size, act_order)
gen_marlin_24_params()
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
) )
q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
gen_allspark_params()
# GPTQ quant
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
) )
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
# Prepare # Prepare
marlin_workspace = MarlinWorkspace( marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
) )
marlin_24_workspace = MarlinWorkspace( marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
) )
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
# AllSpark W8A16 quant
as_supported_case = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
and not act_order
and is_k_full
)
if as_supported_case:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
as_supported_case = as_supported_case and supported_arch
if supported_arch:
has_zp = False
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
globals = { globals = {
# Gen params # Gen params
@ -212,14 +139,15 @@ def bench_run(
"size_n": size_n, "size_n": size_n,
"size_k": size_k, "size_k": size_k,
"a": a, "a": a,
"a_tmp": a_tmp,
# Marlin params # Marlin params
"marlin_w_ref": marlin_w_ref, "marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w, "marlin_q_w": marlin_q_w,
"marlin_s": marlin_s, "marlin_s": marlin_s,
"marlin_s2": marlin_s2,
"marlin_zp": marlin_zp, "marlin_zp": marlin_zp,
"marlin_g_idx": marlin_g_idx, "marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices, "marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm,
"marlin_workspace": marlin_workspace, "marlin_workspace": marlin_workspace,
"is_k_full": is_k_full, "is_k_full": is_k_full,
# Marlin_24 params # Marlin_24 params
@ -232,12 +160,12 @@ def bench_run(
"q_w_gptq": q_w_gptq, "q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices, "repack_sort_indices": repack_sort_indices,
# AllSpark W8A16 params # AllSpark W8A16 params
"qw_reorder": qw_reorder, "qw_reorder": qw_reorder if as_supported_case else None,
"s_reorder": s_reorder, "s_reorder": s_reorder if as_supported_case else None,
"zp_reorder": zp_reorder, "zp_reorder": zp_reorder if as_supported_case else None,
"sm_count": sm_count, "sm_count": sm_count if as_supported_case else None,
"sm_version": sm_version, "sm_version": sm_version if as_supported_case else None,
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD, "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None,
# Kernels # Kernels
"gptq_marlin_gemm": ops.gptq_marlin_gemm, "gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm, "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
@ -248,7 +176,7 @@ def bench_run(
min_run_time = 1 min_run_time = 1
# Warmup pytorch # Warmup pytorch
for _ in range(5): for i in range(5):
torch.matmul(a, marlin_w_ref) torch.matmul(a, marlin_w_ref)
results.append( results.append(
@ -263,17 +191,17 @@ def bench_run(
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
description="gptq_marlin_gemm", description="gptq_marlin_gemm_fp16",
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
@ -281,7 +209,10 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
if marlin_24_supported: if (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
):
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501 stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
@ -292,18 +223,17 @@ def bench_run(
).blocked_autorange(min_run_time=min_run_time) ).blocked_autorange(min_run_time=min_run_time)
) )
if repack_supported: results.append(
results.append( benchmark.Timer(
benchmark.Timer( stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501 globals=globals,
globals=globals, label=label,
label=label, sub_label=sub_label,
sub_label=sub_label, description="gptq_marlin_repack",
description="gptq_marlin_repack", ).blocked_autorange(min_run_time=min_run_time)
).blocked_autorange(min_run_time=min_run_time) )
)
if allspark_supported: if as_supported_case:
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501 stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501
@ -319,6 +249,7 @@ def main(args):
print("Benchmarking models:") print("Benchmarking models:")
for i, model in enumerate(args.models): for i, model in enumerate(args.models):
print(f"[{i}] {model}") print(f"[{i}] {model}")
results: list[benchmark.Measurement] = [] results: list[benchmark.Measurement] = []
for model in args.models: for model in args.models:
@ -346,17 +277,14 @@ def main(args):
): ):
continue continue
for quant_type in query_marlin_supported_quant_types(): for quant_type in query_marlin_supported_quant_types(False):
if ( if (
len(args.limit_num_bits) > 0 len(args.limit_num_bits) > 0
and quant_type.size_bits not in args.limit_num_bits and quant_type.size_bits not in args.limit_num_bits
): ):
continue continue
for group_size in ( for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
MARLIN_SUPPORTED_GROUP_SIZES
+ FP4_MARLIN_SUPPORTED_GROUP_SIZES
):
if ( if (
len(args.limit_group_size) > 0 len(args.limit_group_size) > 0
and group_size not in args.limit_group_size and group_size not in args.limit_group_size

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
@ -7,6 +6,7 @@ import time
from contextlib import nullcontext from contextlib import nullcontext
from datetime import datetime from datetime import datetime
from itertools import product from itertools import product
from types import SimpleNamespace
from typing import Any, TypedDict from typing import Any, TypedDict
import ray import ray
@ -42,7 +42,7 @@ def benchmark_config(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
block_quant_shape: list[int] = None, block_quant_shape: List[int] = None,
use_deep_gemm: bool = False, use_deep_gemm: bool = False,
) -> float: ) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@ -86,9 +86,6 @@ def benchmark_config(
(num_experts, 2 * shard_intermediate_size), dtype=torch.float32 (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
) )
w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32) w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
if use_deep_gemm:
# we use the default block shape for deepgemm
block_quant_shape = [128, 128]
if use_fp8_w8a8: if use_fp8_w8a8:
if block_quant_shape: if block_quant_shape:
block_n, block_k = block_quant_shape[0], block_quant_shape[1] block_n, block_k = block_quant_shape[0], block_quant_shape[1]
@ -402,7 +399,7 @@ class BenchmarkWorker:
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
block_quant_shape: list[int] = None, block_quant_shape: List[int] = None,
use_deep_gemm: bool = False, use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]: ) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed) current_platform.seed_everything(self.seed)
@ -534,7 +531,7 @@ def save_configs(
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
block_quant_shape: list[int], block_quant_shape: List[int],
) -> None: ) -> None:
dtype_str = get_config_dtype_str( dtype_str = get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -565,6 +562,7 @@ def main(args: argparse.Namespace):
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
if args.model_prefix: if args.model_prefix:
config = getattr(config, args.model_prefix) config = getattr(config, args.model_prefix)
config = SimpleNamespace(**config)
if config.architectures[0] == "DbrxForCausalLM": if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts E = config.ffn_config.moe_num_experts
@ -586,11 +584,6 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else: else:
# Support for llama4 # Support for llama4
config = config.get_text_config() config = config.get_text_config()
@ -601,7 +594,11 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype dtype = (
torch.float16
if current_platform.is_rocm()
else getattr(torch, config.torch_dtype)
)
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config) block_quant_shape = get_weight_block_size_safety(config)
@ -628,7 +625,7 @@ def main(args: argparse.Namespace):
4096, 4096,
] ]
else: else:
batch_sizes = args.batch_size batch_sizes = [args.batch_size]
use_deep_gemm = bool(args.use_deep_gemm) use_deep_gemm = bool(args.use_deep_gemm)
@ -736,7 +733,7 @@ if __name__ == "__main__":
) )
parser.add_argument("--use-deep-gemm", action="store_true") parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, nargs="+", required=False) parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true") parser.add_argument("--tune", action="store_true")
parser.add_argument("--trust-remote-code", action="store_true") parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--model-prefix", type=str, required=False) parser.add_argument("--model-prefix", type=str, required=False)

View File

@ -1,159 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import itertools
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton,
)
from vllm.triton_utils import triton
def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
return torch.stack(
[
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
for _ in range(num_tokens)
]
)
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
"""
Verifies vllm vs. Triton
"""
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
# 1. malloc space for triton and vllm
# malloc enough space (max_num_tokens_padded) for the sorted ids
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids_triton = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
)
sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value
expert_ids_triton = torch.zeros(
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
)
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
sorted_ids_vllm.fill_(topk_ids.numel())
expert_ids_vllm = torch.zeros_like(expert_ids_triton)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
# 2. run implementations
moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids_triton,
expert_ids_triton,
num_tokens_post_pad_triton,
)
ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids_vllm,
expert_ids_vllm,
num_tokens_post_pad_vllm,
)
print(f"✅ VLLM implementation works with {num_experts} experts!")
# 3. compare results
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
):
print("✅ Triton and VLLM implementations match.")
else:
print("❌ Triton and VLLM implementations DO NOT match.")
print("Triton expert_ids:", expert_ids_triton)
print("VLLM expert_ids:", expert_ids_vllm)
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm", "triton"], # "triton"
line_names=["VLLM", "Triton"], # "Triton"
plot_name="moe-align-block-size-performance",
args={},
)
)
def benchmark(num_tokens, num_experts, topk, provider):
"""Benchmark function for Triton."""
block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
sorted_ids.fill_(topk_ids.numel())
max_num_m_blocks = max_num_tokens_padded // block_size
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
elif provider == "triton":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--num_experts",
type=int,
default=64,
choices=[8, 16, 32, 64, 128, 256],
)
parser.add_argument(
"--topk",
type=int,
default=8,
choices=[2, 4, 8],
help="Top-k value for correctness check.",
)
args = parser.parse_args()
print("Running correctness check...")
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
from typing import Any, TypedDict from typing import Any, TypedDict

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random import random
import time import time
@ -85,10 +84,7 @@ def main(
if version == "v2": if version == "v2":
if current_platform.is_rocm(): if current_platform.is_rocm():
global PARTITION_SIZE global PARTITION_SIZE
if not args.custom_paged_attn and not current_platform.is_navi(): PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
PARTITION_SIZE = 1024
else:
PARTITION_SIZE = PARTITION_SIZE_ROCM
num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
tmp_output = torch.empty( tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size), size=(num_seqs, num_query_heads, num_partitions, head_size),
@ -163,7 +159,6 @@ def main(
scale, scale,
block_tables, block_tables,
seq_lens, seq_lens,
None,
block_size, block_size,
max_seq_len, max_seq_len,
alibi_slopes, alibi_slopes,

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time import time

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from typing import Optional, Union from typing import Optional, Union

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate from itertools import accumulate
from typing import Optional from typing import Optional
@ -23,7 +22,7 @@ def benchmark_rope_kernels_multi_lora(
seed: int, seed: int,
device: str, device: str,
max_position: int = 8192, max_position: int = 8192,
base: float = 10000, base: int = 10000,
) -> None: ) -> None:
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
torch.set_default_device(device) torch.set_default_device(device)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
WEIGHT_SHAPES = { WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]], "ideal": [[4 * 256 * 32, 256 * 32]],

View File

@ -1,240 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import csv
import os
import random
from datetime import datetime
import flashinfer
import torch
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# KV Cache Layout for TRT-LLM
# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)
def to_float8(x, dtype=torch.float8_e4m3fn):
finfo = torch.finfo(dtype)
min_val, max_val = x.aminmax()
amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
scale = finfo.max / amax * 0.1
x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
return x_scl_sat.to(dtype), scale.float().reciprocal()
@torch.no_grad()
def benchmark_decode(
num_seqs,
max_seq_len,
page_size=16,
dtype=torch.bfloat16,
kv_layout="HND",
num_kv_heads=8,
kv_cache_dtype="auto",
head_dim=128,
warmup=10,
trials=20,
):
torch.set_default_device("cuda")
device = "cuda"
torch.manual_seed(0)
# Currently only HEAD_GRP_SIZE == 8 is supported
HEAD_GRP_SIZE = 8
MAX_SEQ_LEN = max_seq_len
# large number to reduce kv_cache reuse
NUM_BLOCKS = int(256000 / page_size)
workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8, device=device)
# For decode, batch_size is num_decode_token
num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
sm_scale = float(1.0 / (head_dim**0.5))
q = torch.randn(num_seqs, num_qo_heads, head_dim, device=device, dtype=dtype)
kv_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
max_kv_len = max(kv_lens)
kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=device)
max_num_blocks_per_seq = (max_kv_len + page_size - 1) // page_size
block_tables = torch.randint(
0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
)
kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
kv_cache = torch.randn(size=kv_cache_shape, device=device, dtype=dtype)
k_scale = v_scale = 1.0
if kv_cache_dtype.startswith("fp8"):
kv_cache, _ = to_float8(kv_cache)
# Benchmark TRT decode
def trt_decode():
return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
q,
kv_cache,
workspace_buffer,
num_qo_heads,
num_kv_heads,
sm_scale,
block_tables,
kv_lens_tensor,
page_size,
max_kv_len,
kv_cache_dtype,
k_scale,
v_scale,
)
def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
times = []
for i in range(warmup):
fn()
for i in range(trials):
start.record()
fn()
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))
# TRT Decode
trt_mean, trt_std = time_fn(trt_decode)
kv_indptr = [0]
kv_indices = []
kv_last_page_lens = []
for i in range(num_seqs):
seq_len = kv_lens[i]
assert seq_len > 0
num_blocks = (seq_len + page_size - 1) // page_size
kv_indices.extend(block_tables[i, :num_blocks])
kv_indptr.append(kv_indptr[-1] + num_blocks)
kv_last_page_len = seq_len % page_size
if kv_last_page_len == 0:
kv_last_page_len = page_size
kv_last_page_lens.append(kv_last_page_len)
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
workspace_buffer,
kv_layout,
use_tensor_cores=((num_qo_heads // num_kv_heads) > 4),
)
wrapper.plan(
kv_indptr,
kv_indices,
kv_last_page_lens,
num_qo_heads,
num_kv_heads,
head_dim,
page_size,
"NONE",
q_data_type=dtype,
kv_data_type=torch.float8_e4m3fn if kv_cache_dtype.startswith("fp8") else dtype,
)
def baseline_decode():
return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale)
baseline_mean, baseline_std = time_fn(baseline_decode)
# Calculate percentage speedup (positive means TRT is faster)
speedup_percent = (baseline_mean - trt_mean) / baseline_mean
print(
f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.3f}\t{trt_std.item():.3f}"
f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}"
)
# Return results for CSV writing
return {
"num_seqs": num_seqs,
"trt_mean": trt_mean,
"trt_std": trt_std.item(),
"baseline_mean": baseline_mean,
"baseline_std": baseline_std.item(),
"speedup_percent": speedup_percent,
"q_dtype": str(dtype),
"kv_cache_dtype": kv_cache_dtype,
"page_size": page_size,
"num_kv_heads": num_kv_heads,
"head_dim": head_dim,
"max_seq_len": max_seq_len,
}
def write_results_to_csv(results, filename=None):
"""Write benchmark results to CSV file."""
if filename is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"
fieldnames = [
"num_seqs",
"trt_mean",
"trt_std",
"baseline_mean",
"baseline_std",
"speedup_percent",
"q_dtype",
"kv_cache_dtype",
"page_size",
"num_kv_heads",
"head_dim",
"max_seq_len",
]
file_exists = os.path.exists(filename)
with open(filename, "a", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if not file_exists:
writer.writeheader()
for result in results:
writer.writerow(result)
print(f"Results written to {filename}")
if __name__ == "__main__":
num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
all_results = []
print("Running benchmark for kv_cache_dtype: bfloat16")
print(
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
)
for max_seq_len in max_seq_lens:
for bs in num_seqs:
result = benchmark_decode(
bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto"
)
all_results.append(result)
print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8")
print(
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
)
for max_seq_len in max_seq_lens:
for bs in num_seqs:
result = benchmark_decode(
bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8"
)
all_results.append(result)
# Write all results to CSV
write_results_to_csv(all_results)

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from sglang quantization/tuning_block_wise_kernel.py # Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse import argparse

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off # fmt: off
# ruff: noqa: E501 # ruff: noqa: E501
import time import time
@ -85,6 +84,12 @@ def benchmark_shape(m: int,
# === DeepGEMM Implementation === # === DeepGEMM Implementation ===
def deepgemm_gemm(): def deepgemm_gemm():
# A quantization is inside the loop as it depends on activations
# A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
# A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
# A, block_size[1])
# A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
# C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
(B_deepgemm, B_scale_deepgemm), (B_deepgemm, B_scale_deepgemm),
C_deepgemm) C_deepgemm)
@ -92,6 +97,8 @@ def benchmark_shape(m: int,
# === vLLM Triton Implementation === # === vLLM Triton Implementation ===
def vllm_triton_gemm(): def vllm_triton_gemm():
# A quantization is inside the loop as it depends on activations
# A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
return w8a8_block_fp8_matmul(A_vllm, return w8a8_block_fp8_matmul(A_vllm,
B_vllm, B_vllm,
A_scale_vllm, A_scale_vllm,
@ -101,6 +108,9 @@ def benchmark_shape(m: int,
# === vLLM CUTLASS Implementation === # === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm(): def vllm_cutlass_gemm():
# A quantization is inside the loop as it depends on activations
# A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
# A, block_size[1], column_major_scales=True)
return ops.cutlass_scaled_mm(A_vllm_cutlass, return ops.cutlass_scaled_mm(A_vllm_cutlass,
B_vllm.T, B_vllm.T,
scale_a=A_scale_vllm_cutlass, scale_a=A_scale_vllm_cutlass,

View File

@ -1,13 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math import math
import pickle import pickle
import re
from collections import defaultdict from collections import defaultdict
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import regex as re
import seaborn as sns import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement from torch.utils.benchmark import Measurement as TMeasurement

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from collections.abc import Iterable from collections.abc import Iterable

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format # Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM) # ([K, N], TP_SPLIT_DIM)
@ -49,50 +48,4 @@ WEIGHT_SHAPES = {
([16384, 106496], 1), ([16384, 106496], 1),
([53248, 16384], 0), ([53248, 16384], 0),
], ],
"meta-llama/Llama-3.1-8B-Instruct": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-3.3-70B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"mistralai/Mistral-Large-Instruct-2407": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 57344], 1),
([28672, 12288], 0),
],
"Qwen/Qwen2.5-7B-Instruct": [
([3584, 4608], 1),
([3584, 3584], 0),
([3584, 37888], 1),
([18944, 3584], 0),
],
"Qwen/Qwen2.5-32B-Instruct": [
([5120, 7168], 1),
([5120, 5120], 0),
([5120, 55296], 1),
([27648, 5120], 0),
],
"Qwen/Qwen2.5-72B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 59136], 1),
([29568, 8192], 0),
],
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
([2048, 3072], 1),
([2048, 4096], 1),
([2048, 2048], 0),
([2048, 576], 0),
([2048, 21888], 1),
([10944, 2048], 0),
([2048, 2816], 1),
([1408, 2048], 0),
],
} }

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile import cProfile
import pstats import pstats

View File

@ -6,6 +6,11 @@
[tool.ruff] [tool.ruff]
line-length = 88 line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]

Some files were not shown because too many files have changed in this diff Show More