Compare commits
2 Commits
main
...
codex/remo
| Author | SHA1 | Date | |
|---|---|---|---|
| 85013bf094 | |||
| 07665f8679 |
@ -5,11 +5,11 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
|
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
|
||||||
# Note that we have 800 MiB quota, please use it wisely.
|
# Note that we have 800 MiB quota, please use it wisely.
|
||||||
# See https://github.com/pypi/support/issues/6326 .
|
# See https://github.com/pypi/support/issues/6326 .
|
||||||
# Please also sync the value with the one in Dockerfile.
|
# Please also sync the value with the one in Dockerfile.
|
||||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
|
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
|
||||||
|
|
||||||
|
|
||||||
def print_top_10_largest_files(zip_file):
|
def print_top_10_largest_files(zip_file):
|
||||||
|
|||||||
@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
|
||||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.419
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.416
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
# For vllm-vlm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
|
|
||||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
|
||||||
backend: "vllm-vlm"
|
|
||||||
tasks:
|
|
||||||
- name: "chartqa"
|
|
||||||
metrics:
|
|
||||||
- name: "relaxed_accuracy,none"
|
|
||||||
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
|
|
||||||
value: 0.80
|
|
||||||
limit: 100
|
|
||||||
num_fewshot: 0
|
|
||||||
@ -1,10 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
|
|
||||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "mmlu_pro"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,custom-extract"
|
|
||||||
value: 0.80
|
|
||||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size)
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
|
|
||||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||||
tasks:
|
tasks:
|
||||||
- name: "gsm8k"
|
- name: "gsm8k"
|
||||||
|
|||||||
@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
|
|
||||||
|
|
||||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
|
||||||
backend: "vllm-vlm"
|
|
||||||
tasks:
|
|
||||||
- name: "chartqa"
|
|
||||||
metrics:
|
|
||||||
- name: "relaxed_accuracy,none"
|
|
||||||
value: 0.855
|
|
||||||
limit: 2500
|
|
||||||
num_fewshot: 0
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
|
|
||||||
tasks:
|
|
||||||
- name: "mmlu_pro"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,custom-extract"
|
|
||||||
value: 0.82
|
|
||||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
|
||||||
num_fewshot: 5
|
|
||||||
enforce_eager: false # we use false to speed up the eval process
|
|
||||||
kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
|
|
||||||
max_model_len: 40960
|
|
||||||
apply_chat_template: true
|
|
||||||
fewshot_as_multiturn: true
|
|
||||||
gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Qwen2.5-VL-7B-Instruct.yaml
|
|
||||||
@ -1,44 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install lm-eval==0.4.9
|
|
||||||
|
|
||||||
usage() {
|
|
||||||
echo``
|
|
||||||
echo "Runs lm eval harness on ChartQA using multimodal vllm."
|
|
||||||
echo "This pathway is intended to be used to create baselines for "
|
|
||||||
echo "our correctness tests in vllm's CI."
|
|
||||||
echo
|
|
||||||
echo "usage: ${0} <options>"
|
|
||||||
echo
|
|
||||||
echo " -m - huggingface stub or local directory of the model"
|
|
||||||
echo " -l - limit number of samples to run"
|
|
||||||
echo " -t - tensor parallel size to run at"
|
|
||||||
echo
|
|
||||||
}
|
|
||||||
|
|
||||||
while getopts "m:l:t:" OPT; do
|
|
||||||
case ${OPT} in
|
|
||||||
m )
|
|
||||||
MODEL="$OPTARG"
|
|
||||||
;;
|
|
||||||
l )
|
|
||||||
LIMIT="$OPTARG"
|
|
||||||
;;
|
|
||||||
t )
|
|
||||||
TP_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
\? )
|
|
||||||
usage
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
lm_eval --model vllm-vlm \
|
|
||||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
|
|
||||||
--tasks chartqa \
|
|
||||||
--batch_size auto \
|
|
||||||
--apply_chat_template \
|
|
||||||
--limit $LIMIT
|
|
||||||
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Executable file → Normal file
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Executable file → Normal file
@ -1,50 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
|
|
||||||
# We use this for fp8, which HF does not support.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
|
||||||
|
|
||||||
usage() {
|
|
||||||
echo``
|
|
||||||
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
|
|
||||||
echo "This pathway is intended to be used to create baselines for "
|
|
||||||
echo "our automated nm-test-accuracy workflow"
|
|
||||||
echo
|
|
||||||
echo "usage: ${0} <options>"
|
|
||||||
echo
|
|
||||||
echo " -m - huggingface stub or local directory of the model"
|
|
||||||
echo " -l - limit number of samples to run"
|
|
||||||
echo " -f - number of fewshot samples to use"
|
|
||||||
echo " -t - tensor parallel size to run at"
|
|
||||||
echo
|
|
||||||
}
|
|
||||||
|
|
||||||
while getopts "m:b:l:f:t:" OPT; do
|
|
||||||
case ${OPT} in
|
|
||||||
m )
|
|
||||||
MODEL="$OPTARG"
|
|
||||||
;;
|
|
||||||
b )
|
|
||||||
BATCH_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
l )
|
|
||||||
LIMIT="$OPTARG"
|
|
||||||
;;
|
|
||||||
f )
|
|
||||||
FEWSHOT="$OPTARG"
|
|
||||||
;;
|
|
||||||
t )
|
|
||||||
TP_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
\? )
|
|
||||||
usage
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
lm_eval --model vllm \
|
|
||||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
|
|
||||||
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
|
||||||
--batch_size auto
|
|
||||||
@ -19,35 +19,21 @@ RTOL = 0.08
|
|||||||
def launch_lm_eval(eval_config, tp_size):
|
def launch_lm_eval(eval_config, tp_size):
|
||||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||||
max_model_len = eval_config.get("max_model_len", 4096)
|
max_model_len = eval_config.get("max_model_len", 4096)
|
||||||
batch_size = eval_config.get("batch_size", "auto")
|
|
||||||
backend = eval_config.get("backend", "vllm")
|
|
||||||
enforce_eager = eval_config.get("enforce_eager", "true")
|
|
||||||
kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
|
|
||||||
model_args = (
|
model_args = (
|
||||||
f"pretrained={eval_config['model_name']},"
|
f"pretrained={eval_config['model_name']},"
|
||||||
f"tensor_parallel_size={tp_size},"
|
f"tensor_parallel_size={tp_size},"
|
||||||
f"enforce_eager={enforce_eager},"
|
f"enforce_eager=true,"
|
||||||
f"kv_cache_dtype={kv_cache_dtype},"
|
|
||||||
f"add_bos_token=true,"
|
f"add_bos_token=true,"
|
||||||
f"trust_remote_code={trust_remote_code},"
|
f"trust_remote_code={trust_remote_code},"
|
||||||
f"max_model_len={max_model_len},"
|
f"max_model_len={max_model_len}"
|
||||||
)
|
)
|
||||||
results = lm_eval.simple_evaluate(
|
results = lm_eval.simple_evaluate(
|
||||||
model=backend,
|
model="vllm",
|
||||||
model_args=model_args,
|
model_args=model_args,
|
||||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||||
num_fewshot=eval_config["num_fewshot"],
|
num_fewshot=eval_config["num_fewshot"],
|
||||||
limit=eval_config["limit"],
|
limit=eval_config["limit"],
|
||||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
batch_size="auto",
|
||||||
# text models. however, this is regressing measured strict-match for
|
|
||||||
# existing text models in CI, so only apply it for mm, or explicitly set
|
|
||||||
apply_chat_template=eval_config.get(
|
|
||||||
"apply_chat_template", backend == "vllm-vlm"
|
|
||||||
),
|
|
||||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
|
|
||||||
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
|
|
||||||
gen_kwargs=eval_config.get("gen_kwargs"),
|
|
||||||
batch_size=batch_size,
|
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@ -2,23 +2,40 @@
|
|||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
|
This directory contains two sets of benchmarks for vllm.
|
||||||
vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
|
|
||||||
|
- Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance.
|
||||||
|
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
|
||||||
|
|
||||||
|
See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
|
||||||
|
|
||||||
## Performance benchmark quick overview
|
## Performance benchmark quick overview
|
||||||
|
|
||||||
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
|
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
|
||||||
|
|
||||||
**Benchmarking Duration**: about 1hr.
|
**Benchmarking Duration**: about 1hr.
|
||||||
|
|
||||||
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
|
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
|
||||||
|
|
||||||
|
## Nightly benchmark quick overview
|
||||||
|
|
||||||
|
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
|
||||||
|
|
||||||
|
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
|
||||||
|
|
||||||
|
**Benchmarking Duration**: about 3.5hrs.
|
||||||
|
|
||||||
## Trigger the benchmark
|
## Trigger the benchmark
|
||||||
|
|
||||||
The benchmark needs to be triggered manually:
|
Performance benchmark will be triggered when:
|
||||||
|
|
||||||
|
- A PR being merged into vllm.
|
||||||
|
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
|
||||||
|
|
||||||
|
Manually Trigger the benchmark
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
Runtime environment variables:
|
Runtime environment variables:
|
||||||
@ -30,11 +47,14 @@ Runtime environment variables:
|
|||||||
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
|
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
|
||||||
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
||||||
|
|
||||||
|
Nightly benchmark will be triggered when:
|
||||||
|
|
||||||
|
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
|
||||||
|
|
||||||
## Performance benchmark details
|
## Performance benchmark details
|
||||||
|
|
||||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
||||||
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
|
||||||
For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
|
|
||||||
>
|
>
|
||||||
### Latency test
|
### Latency test
|
||||||
|
|
||||||
@ -132,3 +152,26 @@ Here is an example using the script to compare result_a and result_b with Model,
|
|||||||
A comparison diagram will be generated below the table.
|
A comparison diagram will be generated below the table.
|
||||||
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
|
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
|
||||||
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
|
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
|
||||||
|
|
||||||
|
## Nightly test details
|
||||||
|
|
||||||
|
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
|
||||||
|
|
||||||
|
### Workflow
|
||||||
|
|
||||||
|
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
|
||||||
|
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
|
||||||
|
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
|
||||||
|
- Finally, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results on Buildkite.
|
||||||
|
|
||||||
|
### Nightly tests
|
||||||
|
|
||||||
|
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
|
||||||
|
|
||||||
|
### Docker containers
|
||||||
|
|
||||||
|
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
|
||||||
|
|
||||||
|
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
|
||||||
|
|
||||||
|
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
|
||||||
184
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
184
.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
steps:
|
||||||
|
- label: "Wait for container to be ready"
|
||||||
|
key: wait-for-container-image
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
containers:
|
||||||
|
- image: badouralix/curl-jq
|
||||||
|
command:
|
||||||
|
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||||
|
- label: "Cleanup H100"
|
||||||
|
agents:
|
||||||
|
queue: H100
|
||||||
|
depends_on: ~
|
||||||
|
command: docker system prune -a --volumes --force
|
||||||
|
|
||||||
|
- label: "A100"
|
||||||
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
depends_on: wait-for-container-image
|
||||||
|
if: build.branch == "main"
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
|
containers:
|
||||||
|
- image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
|
||||||
|
- label: "H200"
|
||||||
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
|
agents:
|
||||||
|
queue: H200
|
||||||
|
depends_on: wait-for-container-image
|
||||||
|
if: build.branch == "main"
|
||||||
|
plugins:
|
||||||
|
- docker#v5.12.0:
|
||||||
|
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash
|
||||||
|
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
mount-buildkite-agent: true
|
||||||
|
propagate-environment: true
|
||||||
|
ipc: host
|
||||||
|
gpus: 4,5,6,7
|
||||||
|
volumes:
|
||||||
|
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||||
|
environment:
|
||||||
|
- VLLM_USAGE_SOURCE
|
||||||
|
- HF_TOKEN
|
||||||
|
|
||||||
|
#- block: "Run H100 Benchmark"
|
||||||
|
#key: block-h100
|
||||||
|
#depends_on: ~
|
||||||
|
|
||||||
|
- label: "H100"
|
||||||
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
|
agents:
|
||||||
|
queue: H100
|
||||||
|
depends_on: wait-for-container-image
|
||||||
|
if: build.branch == "main"
|
||||||
|
plugins:
|
||||||
|
- docker#v5.12.0:
|
||||||
|
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash
|
||||||
|
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
mount-buildkite-agent: true
|
||||||
|
propagate-environment: true
|
||||||
|
ipc: host
|
||||||
|
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
|
||||||
|
volumes:
|
||||||
|
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||||
|
environment:
|
||||||
|
- VLLM_USAGE_SOURCE
|
||||||
|
- HF_TOKEN
|
||||||
|
|
||||||
|
# Premerge benchmark
|
||||||
|
- label: "A100"
|
||||||
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
depends_on: wait-for-container-image
|
||||||
|
if: build.branch != "main"
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
|
containers:
|
||||||
|
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
|
||||||
|
- label: "H200"
|
||||||
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
|
agents:
|
||||||
|
queue: H200
|
||||||
|
depends_on: wait-for-container-image
|
||||||
|
if: build.branch != "main"
|
||||||
|
plugins:
|
||||||
|
- docker#v5.12.0:
|
||||||
|
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash
|
||||||
|
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
mount-buildkite-agent: true
|
||||||
|
propagate-environment: true
|
||||||
|
ipc: host
|
||||||
|
gpus: 4,5,6,7
|
||||||
|
volumes:
|
||||||
|
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||||
|
environment:
|
||||||
|
- VLLM_USAGE_SOURCE
|
||||||
|
- HF_TOKEN
|
||||||
|
|
||||||
|
#- block: "Run H100 Benchmark"
|
||||||
|
#key: block-h100
|
||||||
|
#depends_on: ~
|
||||||
|
|
||||||
|
- label: "H100"
|
||||||
|
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||||
|
agents:
|
||||||
|
queue: H100
|
||||||
|
depends_on: wait-for-container-image
|
||||||
|
if: build.branch != "main"
|
||||||
|
plugins:
|
||||||
|
- docker#v5.12.0:
|
||||||
|
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||||
|
command:
|
||||||
|
- bash
|
||||||
|
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
mount-buildkite-agent: true
|
||||||
|
propagate-environment: true
|
||||||
|
ipc: host
|
||||||
|
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
|
||||||
|
volumes:
|
||||||
|
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||||
|
environment:
|
||||||
|
- VLLM_USAGE_SOURCE
|
||||||
|
- HF_TOKEN
|
||||||
28
.buildkite/nightly-benchmarks/nightly-annotation.md
Normal file
28
.buildkite/nightly-benchmarks/nightly-annotation.md
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
# Nightly benchmark annotation
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
This file contains the downloading link for benchmarking results.
|
||||||
|
|
||||||
|
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
|
||||||
|
- [benchmarking results](artifact://results.zip)
|
||||||
|
- [benchmarking code](artifact://nightly-benchmarks.zip)
|
||||||
|
|
||||||
|
Please download the visualization scripts in the post
|
||||||
|
|
||||||
|
## Results reproduction
|
||||||
|
|
||||||
|
- Find the docker we use in `benchmarking pipeline`
|
||||||
|
- Deploy the docker, and inside the docker:
|
||||||
|
- Download `nightly-benchmarks.zip`.
|
||||||
|
- In the same folder, run the following code:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export HF_TOKEN=<your HF token>
|
||||||
|
apt update
|
||||||
|
apt install -y git
|
||||||
|
unzip nightly-benchmarks.zip
|
||||||
|
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
And the results will be inside `./benchmarks/results`.
|
||||||
39
.buildkite/nightly-benchmarks/nightly-descriptions.md
Normal file
39
.buildkite/nightly-benchmarks/nightly-descriptions.md
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
|
||||||
|
# Nightly benchmark
|
||||||
|
|
||||||
|
This benchmark aims to:
|
||||||
|
|
||||||
|
- Provide performance clarity: Provide clarity on which engine (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance under which workload.
|
||||||
|
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
|
||||||
|
|
||||||
|
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
|
||||||
|
|
||||||
|
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
- Docker images:
|
||||||
|
- vLLM: `vllm/vllm-openai:v0.6.2`
|
||||||
|
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
||||||
|
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
||||||
|
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
||||||
|
- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
||||||
|
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
||||||
|
- Hardware
|
||||||
|
- 8x Nvidia A100 GPUs
|
||||||
|
- Workload:
|
||||||
|
- Dataset
|
||||||
|
- ShareGPT dataset
|
||||||
|
- Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
|
||||||
|
- Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
|
||||||
|
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
||||||
|
- Models: llama-3 8B, llama-3 70B.
|
||||||
|
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
||||||
|
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
||||||
|
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||||
|
- Evaluation metrics: Throughput (higher is better), TTFT (time to first token, lower is better), ITL (inter-token latency, lower is better).
|
||||||
|
|
||||||
|
## Known issues
|
||||||
|
|
||||||
|
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
|
||||||
|
- TGI does not support `ignore-eos` flag.
|
||||||
196
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
Normal file
196
.buildkite/nightly-benchmarks/nightly-pipeline.yaml
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
common_pod_spec: &common_pod_spec
|
||||||
|
priorityClassName: perf-benchmark
|
||||||
|
nodeSelector:
|
||||||
|
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||||
|
volumes:
|
||||||
|
- name: devshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
- name: hf-cache
|
||||||
|
hostPath:
|
||||||
|
path: /root/.cache/huggingface
|
||||||
|
type: Directory
|
||||||
|
|
||||||
|
common_container_settings: &common_container_settings
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
- name: hf-cache
|
||||||
|
mountPath: /root/.cache/huggingface
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_HOME
|
||||||
|
value: /root/.cache/huggingface
|
||||||
|
- name: VLLM_SOURCE_CODE_LOC
|
||||||
|
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- label: "A100 vllm step 10"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: vllm/vllm-openai:v0.6.2
|
||||||
|
<<: *common_container_settings
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- label: "A100 sglang benchmark"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: lmsysorg/sglang:v0.3.2-cu121
|
||||||
|
<<: *common_container_settings
|
||||||
|
|
||||||
|
- label: "A100 lmdeploy benchmark"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: openmmlab/lmdeploy:v0.6.1-cu12
|
||||||
|
<<: *common_container_settings
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- label: "A100 trt llama-8B"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||||
|
<<: *common_container_settings
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_HOME
|
||||||
|
value: /root/.cache/huggingface
|
||||||
|
- name: VLLM_SOURCE_CODE_LOC
|
||||||
|
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
- name: TEST_SELECTOR
|
||||||
|
value: "llama8B"
|
||||||
|
|
||||||
|
|
||||||
|
- label: "A100 trt llama-70B"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||||
|
<<: *common_container_settings
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: HF_HOME
|
||||||
|
value: /root/.cache/huggingface
|
||||||
|
- name: VLLM_SOURCE_CODE_LOC
|
||||||
|
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
- name: TEST_SELECTOR
|
||||||
|
value: "llama70B"
|
||||||
|
|
||||||
|
|
||||||
|
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
|
||||||
|
# - label: "A100 trt benchmark"
|
||||||
|
# priority: 100
|
||||||
|
# agents:
|
||||||
|
# queue: A100
|
||||||
|
# plugins:
|
||||||
|
# - kubernetes:
|
||||||
|
# podSpec:
|
||||||
|
# <<: *common_pod_spec
|
||||||
|
# containers:
|
||||||
|
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||||
|
# <<: *common_container_settings
|
||||||
|
|
||||||
|
|
||||||
|
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
|
||||||
|
# - label: "A100 tgi benchmark"
|
||||||
|
# priority: 100
|
||||||
|
# agents:
|
||||||
|
# queue: A100
|
||||||
|
# plugins:
|
||||||
|
# - kubernetes:
|
||||||
|
# podSpec:
|
||||||
|
# <<: *common_pod_spec
|
||||||
|
# containers:
|
||||||
|
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
|
||||||
|
# <<: *common_container_settings
|
||||||
|
|
||||||
|
- wait
|
||||||
|
|
||||||
|
- label: "Collect the results"
|
||||||
|
priority: 100
|
||||||
|
agents:
|
||||||
|
queue: A100
|
||||||
|
plugins:
|
||||||
|
- kubernetes:
|
||||||
|
podSpec:
|
||||||
|
<<: *common_pod_spec
|
||||||
|
containers:
|
||||||
|
- image: vllm/vllm-openai:v0.5.0.post1
|
||||||
|
command:
|
||||||
|
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 8
|
||||||
|
volumeMounts:
|
||||||
|
- name: devshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
env:
|
||||||
|
- name: VLLM_USAGE_SOURCE
|
||||||
|
value: ci-test
|
||||||
|
- name: VLLM_SOURCE_CODE_LOC
|
||||||
|
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||||
|
- name: HF_TOKEN
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: hf-token-secret
|
||||||
|
key: token
|
||||||
|
|
||||||
|
- block: ":rocket: check the results!"
|
||||||
@ -5,7 +5,7 @@
|
|||||||
- Input length: 32 tokens.
|
- Input length: 32 tokens.
|
||||||
- Output length: 128 tokens.
|
- Output length: 128 tokens.
|
||||||
- Batch size: fixed (8).
|
- Batch size: fixed (8).
|
||||||
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||||
- CPU Models: llama-3.1 8B.
|
- CPU Models: llama-3.1 8B.
|
||||||
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
||||||
|
|
||||||
@ -16,7 +16,7 @@
|
|||||||
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
- Output length: the corresponding output length of these 200 prompts.
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
||||||
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||||
- CPU Models: llama-3.1 8B.
|
- CPU Models: llama-3.1 8B.
|
||||||
- Evaluation metrics: throughput.
|
- Evaluation metrics: throughput.
|
||||||
|
|
||||||
@ -28,7 +28,7 @@
|
|||||||
- Output length: the corresponding output length of these 200 prompts.
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
||||||
- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||||
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
|
||||||
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
|
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
|
||||||
- CPU Models: llama-3.1 8B.
|
- CPU Models: llama-3.1 8B.
|
||||||
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
||||||
@ -7,7 +7,6 @@ from importlib import util
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
pd.options.display.float_format = "{:.2f}".format
|
|
||||||
plotly_found = util.find_spec("plotly.express") is not None
|
plotly_found = util.find_spec("plotly.express") is not None
|
||||||
|
|
||||||
|
|
||||||
@ -110,10 +109,7 @@ def compare_data_columns(
|
|||||||
if len(compare_frames) >= 2:
|
if len(compare_frames) >= 2:
|
||||||
base = compare_frames[0]
|
base = compare_frames[0]
|
||||||
current = compare_frames[-1]
|
current = compare_frames[-1]
|
||||||
if "P99" in data_column or "Median" in data_column:
|
ratio = current / base
|
||||||
ratio = base / current # for latency
|
|
||||||
else:
|
|
||||||
ratio = current / base
|
|
||||||
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
|
||||||
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
|
||||||
frames.append(ratio)
|
frames.append(ratio)
|
||||||
@ -203,71 +199,6 @@ def split_json_by_tp_pp(
|
|||||||
return saved_paths
|
return saved_paths
|
||||||
|
|
||||||
|
|
||||||
def _add_limit_line(fig, y_value, label):
    """Draw a dashed horizontal reference line at ``y_value`` on ``fig``.

    The line is red when ``label`` contains "ttft" (case-insensitive) and
    blue otherwise, and is annotated with ``"<label>: <y_value> ms"``.
    When plotting is enabled, an empty dashed trace with the same colour is
    added as well so that the limit shows up as a legend entry.
    """
    color = "red" if "ttft" in label.lower() else "blue"

    # Visible dashed line + annotation
    fig.add_hline(
        y=y_value,
        line_dash="dash",
        line_color=color,
        annotation_text=f"{label}: {y_value} ms",
        annotation_position="top left",
    )

    # NOTE(review): `plot` and `plotly_found` are not defined in this function;
    # presumably they are module-level names -- confirm against the caller.
    if not (plot and plotly_found):
        return

    import plotly.graph_objects as go

    # Optional legend item: a trace with no data points, only a line style.
    legend_entry = go.Scatter(
        x=[None],
        y=[None],
        mode="lines",
        line={"dash": "dash", "color": color},
        name=f"{label}",
    )
    fig.add_trace(legend_entry)
|
|
||||||
|
|
||||||
|
|
||||||
def _find_concurrency_col(df: pd.DataFrame) -> str:
|
|
||||||
for c in [
|
|
||||||
"# of max concurrency.",
|
|
||||||
"# of max concurrency",
|
|
||||||
"Max Concurrency",
|
|
||||||
"max_concurrency",
|
|
||||||
"Concurrency",
|
|
||||||
]:
|
|
||||||
if c in df.columns:
|
|
||||||
return c
|
|
||||||
# Fallback: guess an integer-like column (harmless if unused)
|
|
||||||
for c in df.columns:
|
|
||||||
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
|
|
||||||
return c
|
|
||||||
return "# of max concurrency."
|
|
||||||
|
|
||||||
|
|
||||||
def _highlight_threshold(
    df: pd.DataFrame, threshold: float
) -> "pd.io.formats.style.Styler":
    """Highlight numeric per-configuration cells whose value is <= threshold.

    Key columns (model, dataset, input/output length, concurrency) and any
    column whose name starts with "Ratio" are never styled; neither are
    non-numeric columns. Matching cells get a light-green bold style.
    """
    candidate_keys = [
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        _find_concurrency_col(df),
    ]
    key_cols = [name for name in candidate_keys if name in df.columns]

    def _is_config_column(name) -> bool:
        # A configuration column is a numeric, non-key, non-ratio column.
        if name in key_cols or str(name).startswith("Ratio"):
            return False
        return pd.api.types.is_numeric_dtype(df[name])

    def _cell_css(value) -> str:
        # Missing values are left unstyled.
        if pd.notna(value) and value <= threshold:
            return "background-color:#e6ffe6;font-weight:bold;"
        return ""

    conf_cols = [name for name in df.columns if _is_config_column(name)]
    return df.style.map(_cell_css, subset=conf_cols)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -289,26 +220,6 @@ if __name__ == "__main__":
|
|||||||
default="# of max concurrency.",
|
default="# of max concurrency.",
|
||||||
help="column name to use as X Axis in comparison graph",
|
help="column name to use as X Axis in comparison graph",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"-l",
|
|
||||||
"--latency",
|
|
||||||
type=str,
|
|
||||||
default="p99",
|
|
||||||
help="take median|p99 for latency like TTFT/TPOT",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--ttft-max-ms",
|
|
||||||
type=float,
|
|
||||||
default=3000.0,
|
|
||||||
help="Reference limit for TTFT plots (ms)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--tpot-max-ms",
|
|
||||||
type=float,
|
|
||||||
default=100.0,
|
|
||||||
help="Reference limit for TPOT plots (ms)",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
drop_column = "P99"
|
drop_column = "P99"
|
||||||
@ -323,22 +234,12 @@ if __name__ == "__main__":
|
|||||||
"# of max concurrency.",
|
"# of max concurrency.",
|
||||||
"qps",
|
"qps",
|
||||||
]
|
]
|
||||||
|
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
||||||
if "median" in args.latency:
|
html_msgs_for_data_cols = [
|
||||||
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
|
"Compare Output Tokens /n",
|
||||||
html_msgs_for_data_cols = [
|
"Median TTFT /n",
|
||||||
"Compare Output Tokens /n",
|
"Median TPOT /n",
|
||||||
"Median TTFT /n",
|
]
|
||||||
"Median TPOT /n",
|
|
||||||
]
|
|
||||||
drop_column = "P99"
|
|
||||||
elif "p99" in args.latency:
|
|
||||||
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
|
|
||||||
html_msgs_for_data_cols = [
|
|
||||||
"Compare Output Tokens /n",
|
|
||||||
"P99 TTFT /n",
|
|
||||||
"P99 TPOT /n",
|
|
||||||
]
|
|
||||||
|
|
||||||
if len(args.file) == 1:
|
if len(args.file) == 1:
|
||||||
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
files = split_json_by_tp_pp(args.file[0], output_root="splits")
|
||||||
@ -374,83 +275,33 @@ if __name__ == "__main__":
|
|||||||
f"Expected subset: {filtered_info_cols}, "
|
f"Expected subset: {filtered_info_cols}, "
|
||||||
f"but DataFrame has: {list(output_df.columns)}"
|
f"but DataFrame has: {list(output_df.columns)}"
|
||||||
)
|
)
|
||||||
# output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
output_df_sorted = output_df.sort_values(by=existing_group_cols)
|
||||||
output_df_sorted = output_df.sort_values(by=args.xaxis)
|
|
||||||
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
|
||||||
for name, group in output_groups:
|
for name, group in output_groups:
|
||||||
group_name = (
|
html = group.to_html()
|
||||||
",".join(map(str, name)).replace(",", "_").replace("/", "-")
|
|
||||||
)
|
|
||||||
group_html_name = "perf_comparison_" + group_name + ".html"
|
|
||||||
|
|
||||||
metric_name = str(data_cols_to_compare[i]).lower()
|
|
||||||
if "tok/s" in metric_name:
|
|
||||||
html = group.to_html()
|
|
||||||
elif "ttft" in metric_name:
|
|
||||||
styler = _highlight_threshold(group, args.ttft_max_ms).format(
|
|
||||||
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
|
||||||
na_rep="—",
|
|
||||||
)
|
|
||||||
html = styler.to_html(
|
|
||||||
table_attributes='border="1" class="dataframe"'
|
|
||||||
)
|
|
||||||
elif (
|
|
||||||
"tpot" in metric_name
|
|
||||||
or "median" in metric_name
|
|
||||||
or "p99" in metric_name
|
|
||||||
):
|
|
||||||
styler = _highlight_threshold(group, args.tpot_max_ms).format(
|
|
||||||
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
|
|
||||||
na_rep="—",
|
|
||||||
)
|
|
||||||
html = styler.to_html(
|
|
||||||
table_attributes='border="1" class="dataframe"'
|
|
||||||
)
|
|
||||||
|
|
||||||
text_file.write(html_msgs_for_data_cols[i])
|
text_file.write(html_msgs_for_data_cols[i])
|
||||||
text_file.write(html)
|
text_file.write(html)
|
||||||
with open(group_html_name, "a+") as sub_text_file:
|
|
||||||
sub_text_file.write(html_msgs_for_data_cols[i])
|
|
||||||
sub_text_file.write(html)
|
|
||||||
|
|
||||||
if plot and plotly_found:
|
if plot and plotly_found:
|
||||||
import plotly.express as px
|
import plotly.express as px
|
||||||
|
|
||||||
df = group[raw_data_cols]
|
df = group[raw_data_cols]
|
||||||
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
df_sorted = df.sort_values(by=info_cols[y_axis_index])
|
||||||
# Melt DataFrame for plotting
|
# Melt DataFrame for plotting
|
||||||
df_melted = df_sorted.melt(
|
df_melted = df_sorted.melt(
|
||||||
id_vars=info_cols[y_axis_index],
|
id_vars=info_cols[y_axis_index],
|
||||||
var_name="Configuration",
|
var_name="Configuration",
|
||||||
value_name=data_cols_to_compare[i],
|
value_name=data_cols_to_compare[i],
|
||||||
)
|
)
|
||||||
title = (
|
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
||||||
data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
|
# Create Plotly line chart
|
||||||
)
|
fig = px.line(
|
||||||
# Create Plotly line chart
|
df_melted,
|
||||||
fig = px.line(
|
x=info_cols[y_axis_index],
|
||||||
df_melted,
|
y=data_cols_to_compare[i],
|
||||||
x=info_cols[y_axis_index],
|
color="Configuration",
|
||||||
y=data_cols_to_compare[i],
|
title=title,
|
||||||
color="Configuration",
|
markers=True,
|
||||||
title=title,
|
)
|
||||||
markers=True,
|
# Export to HTML
|
||||||
)
|
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
|
||||||
|
|
||||||
# ---- Add threshold lines based on metric name ----
|
|
||||||
if "ttft" in metric_name:
|
|
||||||
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
|
|
||||||
elif (
|
|
||||||
"tpot" in metric_name
|
|
||||||
or "median" in metric_name
|
|
||||||
or "p99" in metric_name
|
|
||||||
):
|
|
||||||
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
|
|
||||||
|
|
||||||
# Export to HTML
|
|
||||||
text_file.write(
|
|
||||||
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
|
||||||
)
|
|
||||||
sub_text_file.write(
|
|
||||||
fig.to_html(full_html=True, include_plotlyjs="cdn")
|
|
||||||
)
|
|
||||||
@ -63,11 +63,9 @@ serving_column_mapping = {
|
|||||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||||
"median_ttft_ms": "Median TTFT (ms)",
|
"median_ttft_ms": "Median TTFT (ms)",
|
||||||
"p99_ttft_ms": "P99 TTFT (ms)",
|
"p99_ttft_ms": "P99 TTFT (ms)",
|
||||||
"std_ttft_ms": "STD TTFT (ms)",
|
|
||||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||||
"median_tpot_ms": "Median",
|
"median_tpot_ms": "Median",
|
||||||
"p99_tpot_ms": "P99",
|
"p99_tpot_ms": "P99",
|
||||||
"std_tpot_ms": "STD TPOT (ms)",
|
|
||||||
"mean_itl_ms": "Mean ITL (ms)",
|
"mean_itl_ms": "Mean ITL (ms)",
|
||||||
"median_itl_ms": "Median ITL (ms)",
|
"median_itl_ms": "Median ITL (ms)",
|
||||||
"p99_itl_ms": "P99 ITL (ms)",
|
"p99_itl_ms": "P99 ITL (ms)",
|
||||||
@ -370,7 +368,7 @@ if __name__ == "__main__":
|
|||||||
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
|
||||||
# we want to turn it into "8xGPUTYPE"
|
# we want to turn it into "8xGPUTYPE"
|
||||||
df["GPU"] = df["GPU"].apply(
|
df["GPU"] = df["GPU"].apply(
|
||||||
lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
|
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# get markdown tables
|
# get markdown tables
|
||||||
@ -392,7 +390,7 @@ if __name__ == "__main__":
|
|||||||
json_file = "benchmark_results.json"
|
json_file = "benchmark_results.json"
|
||||||
with open(results_folder / md_file, "w") as f:
|
with open(results_folder / md_file, "w") as f:
|
||||||
results = read_markdown(
|
results = read_markdown(
|
||||||
"../.buildkite/performance-benchmarks/"
|
"../.buildkite/nightly-benchmarks/"
|
||||||
+ "performance-benchmarks-descriptions.md"
|
+ "performance-benchmarks-descriptions.md"
|
||||||
)
|
)
|
||||||
results = results.format(
|
results = results.format(
|
||||||
26
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
Normal file
26
.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
def main(model, cachedir):
    """Load the tokenizer for ``model`` and save it under ``cachedir``.

    Args:
        model: Model identifier passed to ``AutoTokenizer.from_pretrained``.
        cachedir: Directory passed to ``save_pretrained``.
    """
    AutoTokenizer.from_pretrained(model).save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Both options are mandatory; argparse exits with a usage error otherwise.
    cli = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    cli.add_argument("--model", type=str, required=True, help="Name of the model")
    cli.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )

    options = cli.parse_args()
    main(options.model, options.cachedir)
|
||||||
@ -0,0 +1,97 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """Parse the command line and return the resulting namespace.

    Both ``--results-folder`` and ``--description`` are required string
    options; they are exposed as ``results_folder`` and ``description``.
    """
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    required_options = (
        ("--results-folder", "The folder where the results are stored."),
        ("--description", "Description of the results."),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def get_perf(df, method, model, metric):
    """Collect one metric value per QPS level for a given engine and model.

    Rows are selected when their "Test name" contains ``model``, their
    "Engine" contains ``method`` and their "Test name" contains
    ``"qps_<level>"``. Matching uses ``Series.str.contains`` with its default
    settings. The levels are 2, 4, 8, 16 and "inf", in that order.

    Args:
        df: Results table with at least the columns "Test name", "Engine"
            and ``metric``.
        method: Substring/pattern identifying the serving engine.
        model: Substring/pattern identifying the model in the test name.
        metric: Name of the column to read.

    Returns:
        A NumPy array with one entry per QPS level: the metric of the first
        matching row, or 0.0 when no row matches that level.
    """
    test_names = df["Test name"]
    # The model/engine filter does not depend on the QPS level: build it once
    # instead of re-running both string matches on every iteration.
    base_mask = test_names.str.contains(model) & df["Engine"].str.contains(method)

    values = []
    for qps in [2, 4, 8, 16, "inf"]:
        matches = df[base_mask & test_names.str.contains("qps_" + str(qps))]
        values.append(0.0 if matches.empty else matches[metric].values[0])

    return np.array(values)
|
||||||
|
|
||||||
|
|
||||||
|
def get_perf_w_std(df, method, model, metric):
    """Return ``(mean, std)`` lists of a metric across the QPS levels.

    For "TTFT" and "ITL" the mean comes from the "Mean <metric> (ms)" column
    and the spread is the "Std <metric> (ms)" column divided by the square
    root of "Successful req."; the spread is ``None`` when the std values
    average to zero. For "Tput" the mean is input plus output token
    throughput and the spread is always ``None``. Any other metric trips
    the assertion.
    """
    if metric not in ["TTFT", "ITL"]:
        assert metric == "Tput"
        total = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
        return total.tolist(), None

    mean = get_perf(df, method, model, "Mean " + metric + " (ms)").tolist()
    std = get_perf(df, method, model, "Std " + metric + " (ms)")
    success = get_perf(df, method, model, "Successful req.")

    # An all-zero std column is treated as "no spread information".
    if std.mean() == 0:
        return mean, None

    # Scale the per-request std by sqrt(number of successful requests).
    return mean, (std / np.sqrt(success)).tolist()
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Merge the nightly result files into a markdown report.

    Every ``*_nightly_results.json`` file in ``args.results_folder`` is
    loaded and concatenated, rendered as a pipe-style markdown table, and
    substituted for ``{nightly_results_benchmarking_table}`` in the template
    read from ``args.description``. The result is written to
    ``nightly_results.md`` in the current working directory.
    """
    results_folder = Path(args.results_folder)

    # collect results
    results = []
    for test_file in results_folder.glob("*_nightly_results.json"):
        results = results + json.loads(test_file.read_text())

    # generate markdown table
    md_table = tabulate(
        pd.DataFrame.from_dict(results),
        headers="keys",
        tablefmt="pipe",
        showindex=False,
    )

    template = Path(args.description).read_text()
    report = template.format(nightly_results_benchmarking_table=md_table)
    Path("nightly_results.md").write_text(report)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse the CLI options and build the report.
    main(parse_arguments())
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from lmdeploy.serve.openai.api_client import APIClient
|
||||||
|
|
||||||
|
# Query the server listening on localhost:8000 for the models it exposes and
# print the name of the first one.
api_client = APIClient("http://localhost:8000")
served_models = api_client.available_models
model_name = served_models[0]

print(model_name)
|
||||||
@ -181,14 +181,18 @@ launch_vllm_server() {
|
|||||||
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
|
||||||
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
|
||||||
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
|
||||||
server_command="vllm serve $model \
|
server_command="python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
-tp $tp \
|
-tp $tp \
|
||||||
|
--model $model \
|
||||||
--port $port \
|
--port $port \
|
||||||
$server_args"
|
$server_args"
|
||||||
else
|
else
|
||||||
echo "Key 'fp8' does not exist in common params."
|
echo "Key 'fp8' does not exist in common params."
|
||||||
server_command="vllm serve $model \
|
server_command="python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
-tp $tp \
|
-tp $tp \
|
||||||
|
--model $model \
|
||||||
--port $port \
|
--port $port \
|
||||||
$server_args"
|
$server_args"
|
||||||
fi
|
fi
|
||||||
78
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
Normal file
78
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
set -o pipefail
|
||||||
|
|
||||||
|
|
||||||
|
main() {
  # Collect the nightly benchmark results from Buildkite artifacts, re-upload
  # them (plus the benchmarking scripts and pipeline) as archives, and append
  # the static annotation to the build. Requires VLLM_SOURCE_CODE_LOC.

  # Install any missing tool. Every fallback refreshes the package index
  # first: `apt-get install` fails on an image with empty package lists, and
  # the zip fallback may be the only one that runs.
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
  (which zip) || (apt-get update && apt-get install -y zip)

  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip plotting the results."
    exit 0
  fi

  # initial annotation
  #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

  # download results
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  mkdir -p results/
  /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
  ls
  ls results/

  # upload benchmark results
  zip -r results.zip results/
  /workspace/buildkite-agent artifact upload "results.zip"

  # upload benchmarking scripts
  cd "$VLLM_SOURCE_CODE_LOC/"
  zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
  /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"

  # upload benchmarking pipeline and publish the annotation; both files are
  # referenced relative to the nightly-benchmarks directory
  cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
  /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
  /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md



  # The figures should be generated by a separate process outside the CI/CD pipeline

  # # generate figures
  # python3 -m pip install tabulate pandas matplotlib

  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
  #   --description $description \
  #   --results-folder results/


  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
  #   --description $description \
  #   --results-folder results/ \
  #   --dataset sharegpt

  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
  #   --description $description \
  #   --results-folder results/ \
  #   --dataset sonnet_2048_128

  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
  #   --description $description \
  #   --results-folder results/ \
  #   --dataset sonnet_128_2048

  # # upload results and figures
  # /workspace/buildkite-agent artifact upload "nightly_results*.png"
  # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
  # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
  # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
464
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
Normal file
464
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
Normal file
@ -0,0 +1,464 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -o pipefail
|
||||||
|
set -x
|
||||||
|
|
||||||
|
check_gpus() {
  # Record the number of GPUs and the GPU type in the globals `gpu_count`
  # and `gpu_type`; abort the script when no GPU is listed.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -le 0 ]]; then
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  echo "GPU found."

  # Second whitespace-separated field of each reported GPU name.
  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
  echo "GPU type is $gpu_type"
}
|
||||||
|
|
||||||
|
check_hf_token() {
  # Abort unless HF_TOKEN is set and starts with the "hf_" prefix.
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  fi

  if [[ "$HF_TOKEN" != hf_* ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  fi

  echo "HF_TOKEN is set and valid."
}
|
||||||
|
|
||||||
|
|
||||||
|
upload_to_buildkite() {
  # Upload everything under $RESULTS_FOLDER as Buildkite artifacts.
  # Without the agent binary this is a no-op that still returns 0.
  local agent=/workspace/buildkite-agent

  if [ -f "$agent" ]; then
    # "$agent" annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    "$agent" artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
}
|
||||||
|
|
||||||
|
|
||||||
|
get_current_llm_serving_engine() {
  # Detect which serving engine this container provides and export it as
  # CURRENT_LLM_SERVING_ENGINE. Probes run in a fixed order and the first
  # match wins; when nothing matches the variable is left untouched.
  local label engine

  if which lmdeploy >/dev/null; then
    label=lmdeploy
    engine=lmdeploy
  elif [ -e /tgi-entrypoint.sh ]; then
    label=tgi
    engine=tgi
  elif which trtllm-build >/dev/null; then
    label=tensorrt-llm
    engine=trt
  elif [ -e /sgl-workspace ]; then
    label=sglang
    engine=sglang
  elif [ -e /vllm-workspace ]; then
    label=vllm
    engine=vllm
  else
    # No engine recognised: report success, as falling through the probes does.
    return 0
  fi

  echo "Container: $label"
  export CURRENT_LLM_SERVING_ENGINE=$engine
}
|
||||||
|
|
||||||
|
json2args() {
  # Transform a flat JSON object into command line args; every '_' in a key
  # is replaced by '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  #
  # $1: the JSON string
  # Prints the resulting argument string on stdout and always returns 0.
  local json_string=$1
  local args

  # Declared and assigned separately (ShellCheck SC2155). A jq failure is
  # deliberately not propagated: callers have always received an empty
  # string in that case, and jq's own message still reaches stderr.
  args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  ) || true

  echo "$args"
}
|
||||||
|
|
||||||
|
kill_gpu_processes() {
  # Kill every process that may still hold GPU memory, then block until the
  # memory reported by nvidia-smi drops below 1000 (MiB, per the `nounits`
  # query). Only the first line of the nvidia-smi output is inspected, and
  # there is no upper bound on the wait.
  local pattern
  local used

  # The patterns are matched against the full command line (pkill -f).
  # vLLM now names the process with VLLM prefix after
  # https://github.com/vllm-project/vllm/pull/21445, hence the last entry.
  for pattern in \
    '[p]ython' \
    '[p]ython3' \
    '[t]ritonserver' \
    '[p]t_main_thread' \
    '[t]ext-generation' \
    '[l]mdeploy' \
    '[V]LLM'; do
    pkill -f "$pattern"
  done

  while true; do
    used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)
    # Also stops when the value is not a number (the test itself fails),
    # matching the behaviour of the original `while [ ... -ge 1000 ]` loop.
    [ "$used" -ge 1000 ] || break
    sleep 1
  done
}
|
||||||
|
|
||||||
|
wait_for_server() {
  # Wait for the serving engine to answer on localhost:8000.
  # Polls /v1/completions once per second for at most 1200 seconds.
  # Returns 0 as soon as curl succeeds, 1 when the timeout expires.
  # NOTE: the port is hard-coded here and is independent of the `port`
  # value found in the test definitions.
  if timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done'; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
ensure_installed() {
  # Ensure that the given command is available, installing it with apt-get
  # when it is not found on PATH.
  # $1: used both as the command name looked up with `which` and as the apt
  #     package name, so it only short-circuits when the two coincide.
  local cmd=$1

  if which "$cmd" >/dev/null; then
    return
  fi

  apt-get update && apt-get install -y "$cmd"
}
|
||||||
|
|
||||||
|
run_serving_tests() {
  # Run serving tests using the `vllm bench serve` command.
  # $1: a json file specifying serving test cases
  #
  # Reads the globals CURRENT_LLM_SERVING_ENGINE, TEST_SELECTOR, gpu_count,
  # gpu_type, VLLM_SOURCE_CODE_LOC and RESULTS_FOLDER.
  # For every selected test case and every QPS value the client is asked to
  # save NAME.json into $RESULTS_FOLDER, and NAME.commands (the commands that
  # were used) is written next to it.

  local serving_test_file
  serving_test_file=$1

  # NOTE: the loop is the right-hand side of a pipeline and therefore runs in
  # a subshell: variables assigned inside it are not visible after `done`,
  # and `exit` only terminates the loop, not the whole script.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments (keyed by the current engine name)
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
    client_args=$(json2args "$client_params")

    # @sh shell-quotes each entry, so string entries such as "inf" arrive
    # wrapped in single quotes; this is undone inside the QPS loop below.
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      # give up on all remaining test cases
      break
    fi

    # prepare tokenizer
    # this is required for lmdeploy.
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache

    # change model name for lmdeploy (it will not follow standard hf name)
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name="${test_name}_qps_${qps}"

      # map the engine name to the backend name expected by the client
      backend=$CURRENT_LLM_SERVING_ENGINE
      case "$backend" in
        trt) backend="tensorrt-llm" ;;
        *vllm*) backend="vllm" ;;
      esac

      case "$dataset_name" in
        sharegpt)
          client_command="vllm bench serve \
            --backend $backend \
            --tokenizer /tokenizer_cache \
            --model $model \
            --dataset-name $dataset_name \
            --dataset-path $dataset_path \
            --num-prompts $num_prompts \
            --port $port \
            --save-result \
            --result-dir $RESULTS_FOLDER \
            --result-filename ${new_test_name}.json \
            --request-rate $qps \
            --ignore-eos \
            $client_args"
          ;;
        sonnet)
          sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
          sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
          sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

          client_command="vllm bench serve \
            --backend $backend \
            --tokenizer /tokenizer_cache \
            --model $model \
            --dataset-name $dataset_name \
            --dataset-path $dataset_path \
            --num-prompts $num_prompts \
            --sonnet-input-len $sonnet_input_len \
            --sonnet-output-len $sonnet_output_len \
            --sonnet-prefix-len $sonnet_prefix_len \
            --port $port \
            --save-result \
            --result-dir $RESULTS_FOLDER \
            --result-filename ${new_test_name}.json \
            --request-rate $qps \
            --ignore-eos \
            $client_args"
          ;;
        *)
          echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
          # Only leaves the pipeline subshell (see the note above the loop).
          exit 1
          ;;
      esac

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # The server is started by launch-server.sh, so no server command is
      # known here; the literal "None" is recorded instead.
      server_command="None"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

  done

  kill_gpu_processes
}
|
||||||
|
|
||||||
|
run_genai_perf_tests() {
  # Run the genai-perf tests.
  # $1: a json file specifying genai-perf test cases
  #
  # Reads the globals CURRENT_LLM_SERVING_ENGINE, TEST_SELECTOR, gpu_count and
  # VLLM_SOURCE_CODE_LOC.

  local genai_perf_test_file
  genai_perf_test_file=$1

  # NOTE: the loop is the right-hand side of a pipeline and therefore runs in
  # a subshell; variables assigned inside it are not visible after `done`.
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get server arguments (keyed by the current engine name)
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")

    # @sh shell-quotes each entry, so string entries such as "inf" arrive
    # wrapped in single quotes; this is handled inside the QPS loop below.
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      # give up on all remaining test cases
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # An "inf" request rate is replaced by the number of prompts.
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name="${test_name}_qps_${qps}"

      backend=$CURRENT_LLM_SERVING_ENGINE
      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi

      #TODO: add output dir.
      # $backend is written without inner double quotes: nested quotes would
      # close the surrounding string and leave the expansion unquoted.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend $backend \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
      "

      echo "Client command: $client_command"

      eval "$client_command"

      #TODO: process/record outputs
    done
  done

  kill_gpu_processes
}
|
||||||
|
|
||||||
|
prepare_dataset() {
  # Prepare the benchmark datasets inside $VLLM_SOURCE_CODE_LOC/benchmarks:
  #   - ShareGPT_V3_unfiltered_cleaned_split.json (downloaded)
  #   - sonnet_4x.txt (sonnet.txt repeated four times)
  local sharegpt_file=ShareGPT_V3_unfiltered_cleaned_split.json

  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"

  # download sharegpt dataset
  # Skipped when the file is already present: a plain wget would otherwise
  # store a second copy under a ".1" suffix on every run.
  if [ ! -f "$sharegpt_file" ]; then
    wget "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$sharegpt_file"
  fi

  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
  # The file intentionally still starts with one empty line, as before.
  {
    echo ""
    cat sonnet.txt sonnet.txt sonnet.txt sonnet.txt
  } > sonnet_4x.txt
}
|
||||||
|
|
||||||
|
main() {
  # Entry point of the nightly benchmark: validate the environment, install
  # tools, prepare datasets, run the serving and genai-perf suites, then
  # summarize and upload the results.

  check_gpus
  check_hf_token
  get_current_llm_serving_engine

  pip install -U transformers

  pip install -r requirements/dev.txt
  # prints the location of genai-perf (diagnostic only)
  which genai-perf

  # check storage
  df -h

  # libb64-0d is a genai-perf dependency
  local dep
  for dep in wget curl jq libb64-0d; do
    ensure_installed "$dep"
  done

  prepare_dataset

  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  # global on purpose: read by the test runners and by upload_to_buildkite
  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"

  # run the test
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # run genai-perf tests
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
  # NOTE(review): artifacts/ is presumably the genai-perf output directory; confirm
  mv artifacts/ "$RESULTS_FOLDER/"

  # upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}
|
||||||
|
|
||||||
|
# Script entry point: forward all command-line arguments to main.
main "$@"
|
||||||
@ -15,8 +15,6 @@ check_gpus() {
|
|||||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||||
elif command -v amd-smi; then
|
elif command -v amd-smi; then
|
||||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
||||||
elif command -v hl-smi; then
|
|
||||||
declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $gpu_count -gt 0 ]]; then
|
if [[ $gpu_count -gt 0 ]]; then
|
||||||
@ -25,16 +23,10 @@ check_gpus() {
|
|||||||
echo "Need at least 1 GPU to run benchmarking."
|
echo "Need at least 1 GPU to run benchmarking."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
declare -g arch_suffix=''
|
|
||||||
|
|
||||||
if command -v nvidia-smi; then
|
if command -v nvidia-smi; then
|
||||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||||
elif command -v amd-smi; then
|
elif command -v amd-smi; then
|
||||||
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
||||||
elif command -v hl-smi; then
|
|
||||||
declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
|
|
||||||
arch_suffix='-hpu'
|
|
||||||
fi
|
fi
|
||||||
echo "GPU type is $gpu_type"
|
echo "GPU type is $gpu_type"
|
||||||
}
|
}
|
||||||
@ -146,10 +138,6 @@ kill_gpu_processes() {
|
|||||||
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
||||||
sleep 1
|
sleep 1
|
||||||
done
|
done
|
||||||
elif command -v hl-smi; then
|
|
||||||
while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# remove vllm config file
|
# remove vllm config file
|
||||||
@ -377,7 +365,8 @@ run_serving_tests() {
|
|||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
server_command="$server_envs vllm serve \
|
server_command="$server_envs python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
$server_args"
|
$server_args"
|
||||||
|
|
||||||
# run the server
|
# run the server
|
||||||
@ -463,10 +452,14 @@ main() {
|
|||||||
ARCH='-cpu'
|
ARCH='-cpu'
|
||||||
else
|
else
|
||||||
check_gpus
|
check_gpus
|
||||||
ARCH="$arch_suffix"
|
|
||||||
fi
|
fi
|
||||||
check_hf_token
|
check_hf_token
|
||||||
|
|
||||||
|
# Set to v1 to run v1 benchmark
|
||||||
|
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
|
||||||
|
export VLLM_USE_V1=1
|
||||||
|
fi
|
||||||
|
|
||||||
# dependencies
|
# dependencies
|
||||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
(which jq) || (apt-get update && apt-get -y install jq)
|
(which jq) || (apt-get update && apt-get -y install jq)
|
||||||
@ -482,12 +475,7 @@ main() {
|
|||||||
ensure_sharegpt_downloaded
|
ensure_sharegpt_downloaded
|
||||||
declare -g RESULTS_FOLDER=results/
|
declare -g RESULTS_FOLDER=results/
|
||||||
mkdir -p $RESULTS_FOLDER
|
mkdir -p $RESULTS_FOLDER
|
||||||
QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
|
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||||
|
|
||||||
# dump vllm info via vllm collect-env
|
|
||||||
env_output=$(vllm collect-env)
|
|
||||||
|
|
||||||
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
|
|
||||||
|
|
||||||
# benchmarking
|
# benchmarking
|
||||||
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
|
||||||
@ -0,0 +1,82 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
# Directory (relative to the current working directory) that holds the
# per-test result files.
results_folder = Path("results/")

# Raw serving results, one dict per result file; filled in by the
# ``__main__`` section of this script.
serving_results = []

# Maps keys of the raw result dicts to the column titles printed in the
# markdown table. The insertion order defines the column order.
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "std_tpot_ms": "Std TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
}
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Collect results: every ``<name>.json`` written by the benchmark client
    # is merged with its companion ``<name>.commands`` file.
    # Sorted so that the table rows come out in a deterministic order.
    for test_file in sorted(results_folder.glob("*.json")):
        command_file = test_file.with_suffix(".commands")
        if not command_file.exists():
            # Not a per-test result, e.g. a ``*_nightly_results.json``
            # summary left behind by an earlier run of this script.
            print(f"Skipping {test_file}: {command_file.name} not found.")
            continue

        with open(test_file) as f:
            raw_result = json.load(f)

        # attach the benchmarking command to raw_result
        with open(command_file) as f:
            raw_result.update(json.load(f))

        # the test name is the file name without its extension
        raw_result["test_name"] = test_file.stem

        serving_results.append(raw_result)

    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        # keep only the documented columns and give them readable titles
        serving_results = serving_results[list(serving_column_mapping)].rename(
            columns=serving_column_mapping
        )

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )

    # Output files are prefixed with a timestamp and the engine name. The
    # engine falls back to "unknown" when the variable is not exported, so a
    # missing variable no longer raises TypeError after the benchmarks ran.
    engine = os.environ.get("CURRENT_LLM_SERVING_ENGINE", "unknown")
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + engine

    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header.
        # for those who want to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))
|
||||||
23
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
Normal file
23
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
#!/bin/sh
# Wait until the CI image built for $BUILDKITE_COMMIT is available in the
# public ECR registry.
# Reads: BUILDKITE_BRANCH, BUILDKITE_COMMIT.
# Exits 0 as soon as the manifest request answers with HTTP 200, and 1 after
# 1000 unsuccessful attempts (5 seconds apart).

# NOTE(review): the token is requested for the postmerge repository only,
# while non-main branches query the test repository below - confirm that the
# token is accepted for both.
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)

# POSIX `[ ... = ... ]` instead of the bash-only `[[ ... == ... ]]`: with a
# strictly POSIX /bin/sh the bash form fails, which always selected the
# test-repo URL, even on main.
if [ "$BUILDKITE_BRANCH" = "main" ]; then
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi

# upper bound, in seconds, for each single manifest request
TIMEOUT_SECONDS=10

retries=0
while [ $retries -lt 1000 ]; do
  if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
    exit 0
  fi

  echo "Waiting for image to be available..."

  retries=$((retries + 1))
  sleep 5
done

exit 1
|
||||||
30
.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
Normal file
30
.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama8B_tp1",
|
||||||
|
"environment_variables": {
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama8B_tp4",
|
||||||
|
"environment_variables": {
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -95,38 +95,6 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -265,41 +233,6 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -432,38 +365,6 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -602,41 +503,6 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -772,39 +638,6 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -947,42 +780,6 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
|
||||||
"qps_list": ["inf"],
|
|
||||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"quantization": "awq",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 1000
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -2,7 +2,7 @@
|
|||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [32],
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -28,13 +28,13 @@
|
|||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 32
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [32],
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -60,13 +60,13 @@
|
|||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "sharegpt",
|
"dataset_name": "sharegpt",
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
"num_prompts": 32
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [32],
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -76,7 +76,39 @@
|
|||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 1,
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_tp4_random_1024_128",
|
||||||
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
@ -92,16 +124,16 @@
|
|||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 128,
|
"random-input-len": 1024,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
"num_prompts": 32
|
"num_prompts": 100
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
"test_name": "serving_llama8B_pp6_random_1024_128",
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
"qps_list": [1, 4, 16, "inf"],
|
||||||
"max_concurrency_list": [32],
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
"server_environment_variables": {
|
"server_environment_variables": {
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
@ -111,7 +143,7 @@
|
|||||||
},
|
},
|
||||||
"server_parameters": {
|
"server_parameters": {
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"tensor_parallel_size": 2,
|
"pipeline_parallel_size": 6,
|
||||||
"dtype": "bfloat16",
|
"dtype": "bfloat16",
|
||||||
"distributed_executor_backend": "mp",
|
"distributed_executor_backend": "mp",
|
||||||
"block_size": 128,
|
"block_size": 128,
|
||||||
@ -127,150 +159,10 @@
|
|||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
"backend": "vllm",
|
"backend": "vllm",
|
||||||
"dataset_name": "random",
|
"dataset_name": "random",
|
||||||
"random-input-len": 128,
|
"random-input-len": 1024,
|
||||||
"random-output-len": 128,
|
"random-output-len": 128,
|
||||||
"ignore-eos": "",
|
"ignore-eos": "",
|
||||||
"num_prompts": 32
|
"num_prompts": 100
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_128_2048",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_128_2048",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 128,
|
|
||||||
"random-output-len": 2048,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_random_2048_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp2_random_2048_128",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"max_concurrency_list": [32],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"enable_chunked_prefill": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"load_format": "dummy"
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "random",
|
|
||||||
"random-input-len": 2048,
|
|
||||||
"random-output-len": 128,
|
|
||||||
"ignore-eos": "",
|
|
||||||
"num_prompts": 32
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@ -0,0 +1,32 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama8B_tp1",
|
||||||
|
"environment_variables": {
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_llama8B_tp4",
|
||||||
|
"environment_variables": {
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -1,26 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"num_iters_warmup": 5,
|
|
||||||
"num_iters": 15
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -1,55 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15,
|
|
||||||
"max-model-len": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "latency_llama70B_tp4",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15,
|
|
||||||
"max-model-len": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "latency_mixtral8x7B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"num-iters-warmup": 5,
|
|
||||||
"num-iters": 15,
|
|
||||||
"max-model-len": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -1,82 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_llama70B_tp4_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "serving_mixtral8x7B_tp2_sharegpt",
|
|
||||||
"qps_list": [1, 4, 16, "inf"],
|
|
||||||
"server_environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"server_parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"swap_space": 16,
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"load_format": "dummy",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 256,
|
|
||||||
"async-scheduling": ""
|
|
||||||
},
|
|
||||||
"client_parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"backend": "vllm",
|
|
||||||
"dataset_name": "sharegpt",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama8B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"VLLM_RPC_TIMEOUT": 100000,
|
|
||||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
|
||||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
|
||||||
"VLLM_CPU_SGL_KERNEL": 1,
|
|
||||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"dtype": "bfloat16",
|
|
||||||
"distributed_executor_backend": "mp",
|
|
||||||
"block_size": 128,
|
|
||||||
"trust_remote_code": "",
|
|
||||||
"disable_log_stats": "",
|
|
||||||
"enforce_eager": "",
|
|
||||||
"max_num_batched_tokens": 2048,
|
|
||||||
"max_num_seqs": 256,
|
|
||||||
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 200,
|
|
||||||
"backend": "vllm"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -1,61 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama8B_tp1",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
|
||||||
"tensor_parallel_size": 1,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 1000,
|
|
||||||
"backend": "vllm",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 512,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "throughput_llama70B_tp4",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
|
|
||||||
"tensor_parallel_size": 4,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 1000,
|
|
||||||
"backend": "vllm",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 512,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"test_name": "throughput_mixtral8x7B_tp2",
|
|
||||||
"environment_variables": {
|
|
||||||
"PT_HPU_LAZY_MODE": 1,
|
|
||||||
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
|
|
||||||
"VLLM_CONTIGUOUS_PA": 1,
|
|
||||||
"VLLM_DEFRAG": 1
|
|
||||||
},
|
|
||||||
"parameters": {
|
|
||||||
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
|
||||||
"tensor_parallel_size": 2,
|
|
||||||
"load_format": "dummy",
|
|
||||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
|
||||||
"num_prompts": 1000,
|
|
||||||
"backend": "vllm",
|
|
||||||
"max-model-len": 2048,
|
|
||||||
"max-num-seqs": 512,
|
|
||||||
"async-scheduling": ""
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
46
.buildkite/pyproject.toml
Normal file
46
.buildkite/pyproject.toml
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
# This local pyproject file is part of the migration from yapf to ruff format.
|
||||||
|
# It uses the same core rules as the main pyproject.toml file, but with the
|
||||||
|
# following differences:
|
||||||
|
# - ruff line length is overridden to 88
|
||||||
|
# - deprecated typing ignores (UP006, UP035) have been removed
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 88
|
||||||
|
|
||||||
|
[tool.ruff.lint.per-file-ignores]
|
||||||
|
"vllm/third_party/**" = ["ALL"]
|
||||||
|
"vllm/version.py" = ["F401"]
|
||||||
|
"vllm/_version.py" = ["ALL"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = [
|
||||||
|
# pycodestyle
|
||||||
|
"E",
|
||||||
|
# Pyflakes
|
||||||
|
"F",
|
||||||
|
# pyupgrade
|
||||||
|
"UP",
|
||||||
|
# flake8-bugbear
|
||||||
|
"B",
|
||||||
|
# flake8-simplify
|
||||||
|
"SIM",
|
||||||
|
# isort
|
||||||
|
"I",
|
||||||
|
# flake8-logging-format
|
||||||
|
"G",
|
||||||
|
]
|
||||||
|
ignore = [
|
||||||
|
# star imports
|
||||||
|
"F405", "F403",
|
||||||
|
# lambda expression assignment
|
||||||
|
"E731",
|
||||||
|
# Loop control variable not used within loop body
|
||||||
|
"B007",
|
||||||
|
# f-string format
|
||||||
|
"UP032",
|
||||||
|
# Can remove once 3.10+ is the minimum Python version
|
||||||
|
"UP007",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff.format]
|
||||||
|
docstring-code-format = true
|
||||||
@ -1,5 +1,5 @@
|
|||||||
steps:
|
steps:
|
||||||
# aarch64 + CUDA builds
|
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
|
||||||
- label: "Build arm64 wheel - CUDA 12.9"
|
- label: "Build arm64 wheel - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-arm64-cuda-12-9
|
id: build-wheel-arm64-cuda-12-9
|
||||||
@ -8,28 +8,13 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
# aarch64 build
|
|
||||||
- label: "Build arm64 CPU wheel"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-arm64-cpu
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
# x86 + CUDA builds
|
|
||||||
- label: "Build wheel - CUDA 12.8"
|
- label: "Build wheel - CUDA 12.8"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-cuda-12-8
|
id: build-wheel-cuda-12-8
|
||||||
@ -43,33 +28,33 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
|
- label: "Build wheel - CUDA 12.6"
|
||||||
|
depends_on: ~
|
||||||
|
id: build-wheel-cuda-12-6
|
||||||
|
agents:
|
||||||
|
queue: cpu_queue_postmerge
|
||||||
|
commands:
|
||||||
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
|
- "mkdir artifacts"
|
||||||
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
|
# x86 + CUDA builds
|
||||||
- label: "Build wheel - CUDA 12.9"
|
- label: "Build wheel - CUDA 12.9"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-wheel-cuda-12-9
|
id: build-wheel-cuda-12-9
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 13.0"
|
|
||||||
depends_on: ~
|
|
||||||
id: build-wheel-cuda-13-0
|
|
||||||
agents:
|
|
||||||
queue: cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
|
||||||
- "mkdir artifacts"
|
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
# Build release images (12.9)
|
|
||||||
- label: "Build release image (x86)"
|
- label: "Build release image (x86)"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-release-image-x86
|
id: build-release-image-x86
|
||||||
@ -77,12 +62,13 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||||
# re-tag to default image tag and push, just in case arm64 build fails
|
# re-tag to default image tag and push, just in case arm64 build fails
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
|
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
|
||||||
- label: "Build release image (arm64)"
|
- label: "Build release image (arm64)"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
id: build-release-image-arm64
|
id: build-release-image-arm64
|
||||||
@ -90,7 +76,7 @@ steps:
|
|||||||
queue: arm64_cpu_queue_postmerge
|
queue: arm64_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||||
|
|
||||||
# Add job to create multi-arch manifest
|
# Add job to create multi-arch manifest
|
||||||
@ -156,22 +142,6 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build arm64 CPU release image"
|
|
||||||
key: block-arm64-cpu-release-image-build
|
|
||||||
depends_on: ~
|
|
||||||
|
|
||||||
- label: "Build and publish arm64 CPU release image"
|
|
||||||
depends_on: block-arm64-cpu-release-image-build
|
|
||||||
agents:
|
|
||||||
queue: arm64_cpu_queue_postmerge
|
|
||||||
commands:
|
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
|
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
|
||||||
env:
|
|
||||||
DOCKER_BUILDKIT: "1"
|
|
||||||
|
|
||||||
- label: "Build and publish nightly multi-arch image to DockerHub"
|
- label: "Build and publish nightly multi-arch image to DockerHub"
|
||||||
depends_on:
|
depends_on:
|
||||||
- create-multi-arch-manifest
|
- create-multi-arch-manifest
|
||||||
@ -180,16 +150,11 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
|
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
|
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||||
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
|
- "docker push vllm/vllm-openai:nightly"
|
||||||
- "docker push vllm/vllm-openai:nightly-x86_64"
|
- "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
||||||
- "docker push vllm/vllm-openai:nightly-aarch64"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly"
|
|
||||||
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
|
|
||||||
# Clean up old nightly builds (keep only last 14)
|
# Clean up old nightly builds (keep only last 14)
|
||||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
|
||||||
plugins:
|
plugins:
|
||||||
@ -198,4 +163,3 @@ steps:
|
|||||||
password-env: DOCKERHUB_TOKEN
|
password-env: DOCKERHUB_TOKEN
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
DOCKERHUB_USERNAME: "vllmbot"
|
|
||||||
|
|||||||
@ -8,41 +8,20 @@ set -ex
|
|||||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||||
|
|
||||||
# Get DockerHub credentials from environment
|
# Get DockerHub token from environment
|
||||||
if [ -z "$DOCKERHUB_TOKEN" ]; then
|
if [ -z "$DOCKERHUB_TOKEN" ]; then
|
||||||
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
|
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -z "$DOCKERHUB_USERNAME" ]; then
|
|
||||||
echo "Error: DOCKERHUB_USERNAME environment variable is not set"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Get DockerHub bearer token
|
|
||||||
echo "Getting DockerHub bearer token..."
|
|
||||||
set +x
|
|
||||||
BEARER_TOKEN=$(curl -s -X POST \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
|
|
||||||
"https://hub.docker.com/v2/users/login" | jq -r '.token')
|
|
||||||
set -x
|
|
||||||
|
|
||||||
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
|
|
||||||
echo "Error: Failed to get DockerHub bearer token"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Function to get all tags from DockerHub
|
# Function to get all tags from DockerHub
|
||||||
get_all_tags() {
|
get_all_tags() {
|
||||||
local page=1
|
local page=1
|
||||||
local all_tags=""
|
local all_tags=""
|
||||||
|
|
||||||
while true; do
|
while true; do
|
||||||
set +x
|
local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
|
||||||
local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
|
|
||||||
"$REPO_API_URL?page=$page&page_size=100")
|
"$REPO_API_URL?page=$page&page_size=100")
|
||||||
set -x
|
|
||||||
|
|
||||||
# Get both last_updated timestamp and tag name, separated by |
|
# Get both last_updated timestamp and tag name, separated by |
|
||||||
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
|
||||||
@ -64,9 +43,7 @@ delete_tag() {
|
|||||||
echo "Deleting tag: $tag_name"
|
echo "Deleting tag: $tag_name"
|
||||||
|
|
||||||
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
||||||
set +x
|
local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
|
||||||
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
|
|
||||||
set -x
|
|
||||||
|
|
||||||
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
|
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
|
||||||
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
|
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
|
||||||
|
|||||||
@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
|||||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||||
|
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -25,28 +25,25 @@ function cpu_tests() {
|
|||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
set -xve
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec -it "$container_id" bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
set -evx
|
set -e
|
||||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||||
pip install sentence-transformers datamodel_code_generator
|
pip install sentence-transformers datamodel_code_generator
|
||||||
|
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||||
# Note: disable Bart until it supports V1
|
|
||||||
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||||
# TODO: The test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] below fails on ppc64le. Disabling it for the time being.
|
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# All CPU tests are expected to finish in less than 40 mins.
|
# All CPU tests are expected to finish in less than 40 mins.
|
||||||
|
|
||||||
export container_id
|
export container_id
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 120m bash -c cpu_tests
|
timeout 40m bash -c cpu_tests
|
||||||
|
|
||||||
|
|||||||
@ -58,8 +58,11 @@ function cpu_tests() {
|
|||||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
# Note: disable Bart until it supports V1
|
||||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
|
pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||||
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||||
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
|
||||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||||
pytest -x -v -s tests/models/multimodal/generation \
|
pytest -x -v -s tests/models/multimodal/generation \
|
||||||
@ -70,7 +73,7 @@ function cpu_tests() {
|
|||||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -x -s -v \
|
pytest -x -s -v \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||||
|
|
||||||
# Note: disable it until it supports V1
|
# Note: disable it until it supports V1
|
||||||
# Run AWQ test
|
# Run AWQ test
|
||||||
|
|||||||
@ -1,191 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script builds the Ascend NPU docker image and runs the offline inference inside the container.
|
|
||||||
# It serves as a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Base ubuntu image with basic ascend development libraries and python installed
|
|
||||||
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
|
|
||||||
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
|
|
||||||
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
|
|
||||||
VLLM_ASCEND_TMP_DIR=
|
|
||||||
# Get the test run configuration file from the vllm-ascend repository
|
|
||||||
fetch_vllm_test_cfg() {
|
|
||||||
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
|
|
||||||
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
|
|
||||||
cleanup() {
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
|
|
||||||
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# If the file already exists locally, just overwrite it
|
|
||||||
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
|
|
||||||
|
|
||||||
# The EXIT trap will be overwritten later, and once execution reaches this point its job of cleaning up
|
|
||||||
# after an abnormal exit is done, so the temporary resources are deleted manually here.
|
|
||||||
rm -rf "${VLLM_ASCEND_TMP_DIR}"
|
|
||||||
trap - EXIT
|
|
||||||
}
|
|
||||||
|
|
||||||
# Reads the test run configuration file previously fetched from the remote repository.
|
|
||||||
# Loads the configuration into the current script environment.
|
|
||||||
get_config() {
|
|
||||||
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
|
|
||||||
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
source "${TEST_RUN_CONFIG_FILE}"
|
|
||||||
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# get test running configuration.
|
|
||||||
fetch_vllm_test_cfg
|
|
||||||
get_config
|
|
||||||
# Check if the function call was successful. If not, exit the script.
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
|
|
||||||
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
|
|
||||||
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
|
|
||||||
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
|
|
||||||
echo "agent_idx: ${agent_idx}"
|
|
||||||
builder_name="cachebuilder${agent_idx}"
|
|
||||||
builder_cache_dir="/mnt/docker-cache${agent_idx}"
|
|
||||||
mkdir -p ${builder_cache_dir}
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
|
|
||||||
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
|
|
||||||
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
|
|
||||||
--cache-to type=local,dest=${builder_cache_dir},mode=max \
|
|
||||||
--progress=plain --load -t ${image_name} -f - .
|
|
||||||
FROM ${BASE_IMAGE_NAME}
|
|
||||||
|
|
||||||
# Define environments
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
|
|
||||||
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
|
|
||||||
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
|
|
||||||
apt-get update -y && \
|
|
||||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
|
||||||
rm -rf /var/cache/apt/* && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# Install for pytest to make the docker build cache layer always valid
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install pytest>=6.0 modelscope
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r requirements/common.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
|
|
||||||
# Install vLLM
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
|
||||||
python3 -m pip uninstall -y triton
|
|
||||||
|
|
||||||
# Install vllm-ascend
|
|
||||||
WORKDIR /workspace
|
|
||||||
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
|
|
||||||
ARG VLLM_ASCEND_TAG=main
|
|
||||||
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
|
|
||||||
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
|
|
||||||
|
|
||||||
# Install vllm-ascend dependencies in advance. Effect: As long as requirements.txt remains unchanged, the docker cache layer will be valid.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -r /workspace/vllm-ascend/requirements.txt
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
|
||||||
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
|
||||||
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
|
|
||||||
|
|
||||||
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
ENV VLLM_USE_MODELSCOPE=True
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm-ascend
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
|
||||||
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || true;
|
|
||||||
docker image rm -f "${image_name}" || true;
|
|
||||||
docker system prune -f || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
|
|
||||||
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
|
|
||||||
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
|
|
||||||
# returns --device /dev/davinci0 --device /dev/davinci1
|
|
||||||
parse_and_gen_devices() {
|
|
||||||
local input="$1"
|
|
||||||
local index cards_num
|
|
||||||
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
|
|
||||||
index="${BASH_REMATCH[1]}"
|
|
||||||
cards_num="${BASH_REMATCH[2]}"
|
|
||||||
else
|
|
||||||
echo "parse error" >&2
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
local devices=""
|
|
||||||
local i=0
|
|
||||||
while (( i < cards_num )); do
|
|
||||||
local dev_idx=$(((index - 1)*cards_num + i ))
|
|
||||||
devices="$devices --device /dev/davinci${dev_idx}"
|
|
||||||
((i++))
|
|
||||||
done
|
|
||||||
|
|
||||||
# trim leading space
|
|
||||||
devices="${devices#"${devices%%[![:space:]]*}"}"
|
|
||||||
# Print the devices string to stdout so that the caller can capture it via command substitution
|
|
||||||
printf '%s' "$devices"
|
|
||||||
}
|
|
||||||
|
|
||||||
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
|
|
||||||
|
|
||||||
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
|
|
||||||
# This test checks whether the OOT platform interface is functioning properly in conjunction with
|
|
||||||
# the hardware plugin vllm-ascend.
|
|
||||||
model_cache_dir=/mnt/modelscope${agent_idx}
|
|
||||||
mkdir -p ${model_cache_dir}
|
|
||||||
docker run \
|
|
||||||
${devices} \
|
|
||||||
--device /dev/davinci_manager \
|
|
||||||
--device /dev/devmm_svm \
|
|
||||||
--device /dev/hisi_hdc \
|
|
||||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
|
||||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
|
||||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
|
||||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
|
||||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
|
||||||
-v ${model_cache_dir}:/root/.cache/modelscope \
|
|
||||||
--entrypoint="" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
bash -c '
|
|
||||||
set -e
|
|
||||||
pytest -v -s tests/e2e/vllm_interface/
|
|
||||||
'
|
|
||||||
@ -62,11 +62,12 @@ echo "--- Installing Python dependencies ---"
|
|||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
export VLLM_XLA_CACHE_PATH=
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
echo "--- Hardware Information ---"
|
echo "--- Hardware Information ---"
|
||||||
# tpu-info
|
# tpu-info
|
||||||
|
|||||||
@ -62,11 +62,12 @@ echo "--- Installing Python dependencies ---"
|
|||||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
&& python3 -m pip install --progress-bar off hf-transfer
|
||||||
echo "--- Python dependencies installed ---"
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
export VLLM_XLA_CACHE_PATH=
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
echo "--- Hardware Information ---"
|
echo "--- Hardware Information ---"
|
||||||
# tpu-info
|
# tpu-info
|
||||||
|
|||||||
@ -20,10 +20,7 @@ trap remove_docker_container EXIT
|
|||||||
|
|
||||||
# Run the image and test offline inference/tensor parallel
|
# Run the image and test offline inference/tensor parallel
|
||||||
docker run \
|
docker run \
|
||||||
--device /dev/dri:/dev/dri \
|
--device /dev/dri \
|
||||||
--net=host \
|
|
||||||
--ipc=host \
|
|
||||||
--privileged \
|
|
||||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||||
--entrypoint="" \
|
--entrypoint="" \
|
||||||
-e "HF_TOKEN=${HF_TOKEN}" \
|
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||||
@ -38,14 +35,16 @@ docker run \
|
|||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
cd tests
|
cd tests
|
||||||
pytest -v -s v1/core
|
pytest -v -s v1/core
|
||||||
pytest -v -s v1/engine
|
pytest -v -s v1/engine
|
||||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||||
pytest -v -s v1/structured_output
|
pytest -v -s v1/structured_output
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||||
pytest -v -s v1/test_serial_utils.py
|
pytest -v -s v1/test_serial_utils.py
|
||||||
|
pytest -v -s v1/test_utils.py
|
||||||
|
pytest -v -s v1/test_metrics_reader.py
|
||||||
'
|
'
|
||||||
|
|||||||
@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
|
|||||||
bench_throughput_exit_code=$?
|
bench_throughput_exit_code=$?
|
||||||
|
|
||||||
# run server-based benchmarks and upload the result to buildkite
|
# run server-based benchmarks and upload the result to buildkite
|
||||||
vllm serve meta-llama/Llama-2-7b-chat-hf &
|
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
|
||||||
server_pid=$!
|
server_pid=$!
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
|
||||||
|
|||||||
@ -1,59 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
|
|
||||||
# Setup script for Prime-RL integration tests
|
|
||||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
|
||||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
|
|
||||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
|
|
||||||
|
|
||||||
echo "Setting up Prime-RL integration test environment..."
|
|
||||||
|
|
||||||
# Clean up any existing Prime-RL directory
|
|
||||||
if [ -d "${PRIME_RL_DIR}" ]; then
|
|
||||||
echo "Removing existing Prime-RL directory..."
|
|
||||||
rm -rf "${PRIME_RL_DIR}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install UV if not available
|
|
||||||
if ! command -v uv &> /dev/null; then
|
|
||||||
echo "Installing UV package manager..."
|
|
||||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
||||||
source $HOME/.local/bin/env
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clone Prime-RL repository at specific branch for reproducible tests
|
|
||||||
PRIME_RL_BRANCH="integ-vllm-main"
|
|
||||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
|
|
||||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
|
|
||||||
cd "${PRIME_RL_DIR}"
|
|
||||||
|
|
||||||
echo "Setting up UV project environment..."
|
|
||||||
export UV_PROJECT_ENVIRONMENT=/usr/local
|
|
||||||
ln -s /usr/bin/python3 /usr/local/bin/python
|
|
||||||
|
|
||||||
# Remove vllm pin from pyproject.toml
|
|
||||||
echo "Removing vllm pin from pyproject.toml..."
|
|
||||||
sed -i '/vllm==/d' pyproject.toml
|
|
||||||
|
|
||||||
# Sync Prime-RL dependencies
|
|
||||||
echo "Installing Prime-RL dependencies..."
|
|
||||||
uv sync --inexact && uv sync --inexact --all-extras
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
echo "Verifying installations..."
|
|
||||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
|
|
||||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
|
|
||||||
|
|
||||||
echo "Prime-RL integration test environment setup complete!"
|
|
||||||
|
|
||||||
echo "Running Prime-RL integration tests..."
|
|
||||||
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
|
|
||||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
|
|
||||||
|
|
||||||
echo "Prime-RL integration tests completed!"
|
|
||||||
@ -1,62 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euxo pipefail
|
|
||||||
|
|
||||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
|
||||||
THRESHOLD=${1:-0.25}
|
|
||||||
NUM_Q=${2:-1319}
|
|
||||||
PORT=${3:-8010}
|
|
||||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
|
||||||
mkdir -p "${OUT_DIR}"
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
local port=$1
|
|
||||||
timeout 600 bash -c '
|
|
||||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done'
|
|
||||||
}
|
|
||||||
|
|
||||||
MODEL="deepseek-ai/DeepSeek-V2-lite"
|
|
||||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
|
||||||
|
|
||||||
cleanup() {
|
|
||||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
|
||||||
kill "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
for _ in {1..20}; do
|
|
||||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
|
||||||
sleep 0.5
|
|
||||||
done
|
|
||||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
|
||||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
|
||||||
vllm serve "$MODEL" \
|
|
||||||
--enforce-eager \
|
|
||||||
--tensor-parallel-size 2 \
|
|
||||||
--data-parallel-size 2 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--enable-eplb \
|
|
||||||
--trust-remote-code \
|
|
||||||
--max-model-len 2048 \
|
|
||||||
--port $PORT &
|
|
||||||
SERVER_PID=$!
|
|
||||||
wait_for_server $PORT
|
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
|
||||||
python3 - <<PY
|
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
|
||||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
|
||||||
PY
|
|
||||||
|
|
||||||
cleanup
|
|
||||||
SERVER_PID=
|
|
||||||
sleep 1
|
|
||||||
PORT=$((PORT+1))
|
|
||||||
done
|
|
||||||
@ -1,61 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euxo pipefail
|
|
||||||
|
|
||||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
|
||||||
THRESHOLD=${1:-0.8}
|
|
||||||
NUM_Q=${2:-1319}
|
|
||||||
PORT=${3:-8020}
|
|
||||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
|
||||||
mkdir -p "${OUT_DIR}"
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
local port=$1
|
|
||||||
timeout 600 bash -c '
|
|
||||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done'
|
|
||||||
}
|
|
||||||
|
|
||||||
MODEL="QWen/Qwen3-30B-A3B-FP8"
|
|
||||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
|
||||||
|
|
||||||
cleanup() {
|
|
||||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
|
||||||
kill "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
for _ in {1..20}; do
|
|
||||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
|
||||||
sleep 0.5
|
|
||||||
done
|
|
||||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
|
||||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
|
||||||
vllm serve "$MODEL" \
|
|
||||||
--enforce-eager \
|
|
||||||
--tensor-parallel-size 2 \
|
|
||||||
--data-parallel-size 2 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--trust-remote-code \
|
|
||||||
--max-model-len 2048 \
|
|
||||||
--port $PORT &
|
|
||||||
SERVER_PID=$!
|
|
||||||
wait_for_server $PORT
|
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
|
||||||
python3 - <<PY
|
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
|
||||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
|
||||||
PY
|
|
||||||
|
|
||||||
cleanup
|
|
||||||
SERVER_PID=
|
|
||||||
sleep 1
|
|
||||||
PORT=$((PORT+1))
|
|
||||||
done
|
|
||||||
@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
|
|||||||
TENSOR_PARALLEL_SIZE=1
|
TENSOR_PARALLEL_SIZE=1
|
||||||
MAX_MODEL_LEN=2048
|
MAX_MODEL_LEN=2048
|
||||||
DOWNLOAD_DIR=/mnt/disks/persist
|
DOWNLOAD_DIR=/mnt/disks/persist
|
||||||
EXPECTED_THROUGHPUT=8.7
|
EXPECTED_THROUGHPUT=10.0
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=128
|
OUTPUT_LEN=128
|
||||||
|
|||||||
@ -42,7 +42,7 @@ echo "lanching vllm..."
|
|||||||
echo "logging to $VLLM_LOG"
|
echo "logging to $VLLM_LOG"
|
||||||
echo
|
echo
|
||||||
|
|
||||||
vllm serve $MODEL \
|
VLLM_USE_V1=1 vllm serve $MODEL \
|
||||||
--seed 42 \
|
--seed 42 \
|
||||||
--max-num-seqs $MAX_NUM_SEQS \
|
--max-num-seqs $MAX_NUM_SEQS \
|
||||||
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
|
||||||
|
|||||||
@ -58,25 +58,33 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
|
|||||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
||||||
|
|
||||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
if [[ $normal_wheel == *"cu126"* ]]; then
|
||||||
|
# if $normal_wheel matches cu126, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu126 wheels"
|
||||||
|
elif [[ $normal_wheel == *"cu128"* ]]; then
|
||||||
|
# if $normal_wheel matches cu128, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu128 wheels"
|
||||||
|
else
|
||||||
# only upload index.html for cu129 wheels (default wheels) as it
|
# only upload index.html for cu129 wheels (default wheels) as it
|
||||||
# is available on both x86 and arm64
|
# is available on both x86 and arm64
|
||||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||||
else
|
|
||||||
echo "Skipping index files for non-cu129 wheels"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# generate index for nightly
|
# generate index for nightly
|
||||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
|
||||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
||||||
|
|
||||||
if [[ $normal_wheel == *"cu129"* ]]; then
|
if [[ $normal_wheel == *"cu126"* ]]; then
|
||||||
|
# if $normal_wheel matches cu126, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu126 wheels"
|
||||||
|
elif [[ $normal_wheel == *"cu128"* ]]; then
|
||||||
|
# if $normal_wheel matches cu128, do not upload the index.html
|
||||||
|
echo "Skipping index files for cu128 wheels"
|
||||||
|
else
|
||||||
# only upload index.html for cu129 wheels (default wheels) as it
|
# only upload index.html for cu129 wheels (default wheels) as it
|
||||||
# is available on both x86 and arm64
|
# is available on both x86 and arm64
|
||||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||||
else
|
|
||||||
echo "Skipping index files for non-cu129 wheels"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -6,28 +6,24 @@
|
|||||||
# to generate the final pipeline yaml file.
|
# to generate the final pipeline yaml file.
|
||||||
|
|
||||||
# Documentation
|
# Documentation
|
||||||
# label(str): the name of the test. emojis allowed.
|
# label(str): the name of the test. emoji allowed.
|
||||||
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
|
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||||
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
|
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
||||||
# fast_check_only(bool): run this test on the fastcheck pipeline only
|
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
|
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||||
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
|
|
||||||
# command(str): the single command to run for tests. incompatible with commands.
|
# command(str): the single command to run for tests. incompatible with commands.
|
||||||
# commands(list): the list of commands to run for the test. incompatible with command.
|
# commands(list): the list of commands to run for test. incompatbile with command.
|
||||||
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
|
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||||
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
|
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||||
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
|
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||||
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
|
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||||
# in this case, commands must be specified. the first command runs on the first host, the second
|
# in this case, commands must be specified. the first command runs on first host, the second
|
||||||
# command runs on the second host.
|
# command runs on the second host.
|
||||||
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
|
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||||
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
|
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
|
||||||
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
|
|
||||||
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
|
|
||||||
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
|
|
||||||
|
|
||||||
# When adding a test
|
# When adding a test
|
||||||
# - If the test belongs to an existing group, add it there
|
# - If the test belong to an existing group, add it there
|
||||||
# - If the test is short, add to any existing step
|
# - If the test is short, add to any existing step
|
||||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||||
# Note that all steps execute in parallel.
|
# Note that all steps execute in parallel.
|
||||||
@ -38,7 +34,7 @@ steps:
|
|||||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||||
# if this test fails, it means the nightly torch version is not compatible with some
|
# if this test fails, it means the nightly torch version is not compatible with some
|
||||||
# of the dependencies. Please check the error message and add the package to whitelist
|
# of the dependencies. Please check the error message and add the package to whitelist
|
||||||
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
|
# in /vllm/tools/generate_nightly_torch_test.py
|
||||||
soft_fail: true
|
soft_fail: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- requirements/nightly_torch_test.txt
|
- requirements/nightly_torch_test.txt
|
||||||
@ -50,28 +46,19 @@ steps:
|
|||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/multimodal
|
|
||||||
- tests/utils_
|
|
||||||
commands:
|
|
||||||
- pytest -v -s -m 'not cpu_test' multimodal
|
|
||||||
- pytest -v -s utils_
|
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
|
|
||||||
timeout_in_minutes: 10
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/test_inputs.py
|
- tests/test_inputs.py
|
||||||
- tests/test_outputs.py
|
- tests/test_outputs.py
|
||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
|
- tests/utils_
|
||||||
- tests/standalone_tests/lazy_imports.py
|
- tests/standalone_tests/lazy_imports.py
|
||||||
- tests/transformers_utils
|
- tests/transformers_utils
|
||||||
no_gpu: true
|
|
||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_imports.py
|
- python3 standalone_tests/lazy_imports.py
|
||||||
- pytest -v -s test_inputs.py
|
- pytest -v -s test_inputs.py
|
||||||
- pytest -v -s test_outputs.py
|
- pytest -v -s test_outputs.py
|
||||||
- pytest -v -s -m 'cpu_test' multimodal
|
- pytest -v -s multimodal
|
||||||
- pytest -v -s transformers_utils
|
- pytest -v -s utils_ # Utils
|
||||||
|
- pytest -v -s transformers_utils # transformers_utils
|
||||||
|
|
||||||
- label: Python-only Installation Test # 10min
|
- label: Python-only Installation Test # 10min
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
@ -123,7 +110,7 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Entrypoints Integration Test (API Server) # 100min
|
- label: Entrypoints Integration Test (API Server) # 100min
|
||||||
timeout_in_minutes: 130
|
timeout_in_minutes: 130
|
||||||
@ -161,6 +148,7 @@ steps:
|
|||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
|
- vllm/core/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed/test_utils
|
||||||
- tests/distributed/test_pynccl
|
- tests/distributed/test_pynccl
|
||||||
- tests/distributed/test_events
|
- tests/distributed/test_events
|
||||||
@ -168,36 +156,28 @@ steps:
|
|||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/distributed
|
- tests/v1/test_async_llm_dp.py
|
||||||
|
- tests/v1/test_external_lb_dp.py
|
||||||
|
- tests/v1/test_internal_lb_dp.py
|
||||||
|
- tests/v1/test_hybrid_lb_dp.py
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
|
||||||
commands:
|
commands:
|
||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
# test with tp=2 and external_dp=2
|
||||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with torchrun tp=2 and external_dp=2
|
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with torchrun tp=2 and pp=2
|
# test with tp=2 and pp=2
|
||||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||||
# test with torchrun tp=4 and dp=1
|
|
||||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with torchrun tp=2, pp=2 and dp=1
|
|
||||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with torchrun tp=1 and dp=4 with ep
|
|
||||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with torchrun tp=2 and dp=2 with ep
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
|
||||||
# test with internal dp
|
# test with internal dp
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s distributed/test_events.py
|
- pytest -v -s distributed/test_events.py
|
||||||
- pytest -v -s distributed/test_symm_mem_allreduce.py
|
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
- pushd ../examples/offline_inference
|
- pushd ../examples/offline_inference
|
||||||
@ -205,24 +185,6 @@ steps:
|
|||||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||||
- popd
|
- popd
|
||||||
|
|
||||||
- label: Distributed Tests (8 GPUs) # 4min
|
|
||||||
timeout_in_minutes: 10
|
|
||||||
gpu: h100
|
|
||||||
num_gpus: 8
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
source_file_dependencies:
|
|
||||||
- examples/offline_inference/torchrun_dp_example.py
|
|
||||||
- vllm/config/parallel.py
|
|
||||||
- vllm/distributed/
|
|
||||||
- vllm/v1/engine/llm_engine.py
|
|
||||||
- vllm/v1/executor/uniproc_executor.py
|
|
||||||
- vllm/v1/worker/gpu_worker.py
|
|
||||||
commands:
|
|
||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
|
||||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
|
||||||
# test with torchrun tp=2 and dp=4 with ep
|
|
||||||
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
|
|
||||||
|
|
||||||
- label: EPLB Algorithm Test # 5min
|
- label: EPLB Algorithm Test # 5min
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
@ -316,44 +278,24 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s -m 'not cpu_test' v1/core
|
- pytest -v -s v1/core
|
||||||
- pytest -v -s v1/executor
|
- pytest -v -s v1/executor
|
||||||
- pytest -v -s v1/kv_offload
|
- pytest -v -s v1/kv_offload
|
||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/logits_processors
|
- pytest -v -s v1/logits_processors
|
||||||
- pytest -v -s v1/worker
|
- pytest -v -s v1/worker
|
||||||
|
- pytest -v -s v1/structured_output
|
||||||
- pytest -v -s v1/spec_decode
|
- pytest -v -s v1/spec_decode
|
||||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
- pytest -v -s v1/kv_connector/unit
|
||||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
- pytest -v -s v1/metrics
|
||||||
|
- pytest -v -s v1/test_serial_utils.py
|
||||||
|
- pytest -v -s v1/test_utils.py
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
- pytest -v -s v1/test_request.py
|
- pytest -v -s v1/test_metrics_reader.py
|
||||||
# Integration test for streaming correctness (requires special branch).
|
# Integration test for streaming correctness (requires special branch).
|
||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
|
|
||||||
- label: V1 Test attention (H100) # 10min
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
gpu: h100
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/v1/attention
|
|
||||||
- tests/v1/attention
|
|
||||||
commands:
|
|
||||||
- pytest -v -s v1/attention
|
|
||||||
|
|
||||||
- label: V1 Test others (CPU) # 5 mins
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/v1
|
|
||||||
no_gpu: true
|
|
||||||
commands:
|
|
||||||
# split the test to avoid interference
|
|
||||||
- pytest -v -s -m 'cpu_test' v1/core
|
|
||||||
- pytest -v -s v1/structured_output
|
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
|
||||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
|
||||||
- pytest -v -s -m 'cpu_test' v1/metrics
|
|
||||||
|
|
||||||
|
|
||||||
- label: Examples Test # 30min
|
- label: Examples Test # 30min
|
||||||
timeout_in_minutes: 45
|
timeout_in_minutes: 45
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -372,14 +314,12 @@ steps:
|
|||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language.py --seed 0
|
||||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||||
- python3 offline_inference/basic/classify.py
|
- python3 offline_inference/basic/classify.py
|
||||||
- python3 offline_inference/basic/embed.py
|
- python3 offline_inference/basic/embed.py
|
||||||
- python3 offline_inference/basic/score.py
|
- python3 offline_inference/basic/score.py
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
|
||||||
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
|
||||||
|
|
||||||
- label: Platform Tests (CUDA) # 4min
|
- label: Platform Tests (CUDA) # 4min
|
||||||
timeout_in_minutes: 15
|
timeout_in_minutes: 15
|
||||||
@ -414,12 +354,7 @@ steps:
|
|||||||
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||||
--ignore=lora/test_chatglm3_tp.py \
|
--ignore=lora/test_chatglm3_tp.py \
|
||||||
--ignore=lora/test_llama_tp.py \
|
--ignore=lora/test_llama_tp.py \
|
||||||
--ignore=lora/test_llm_with_multi_loras.py \
|
--ignore=lora/test_llm_with_multi_loras.py
|
||||||
--ignore=lora/test_olmoe_tp.py \
|
|
||||||
--ignore=lora/test_deepseekv2_tp.py \
|
|
||||||
--ignore=lora/test_gptoss.py \
|
|
||||||
--ignore=lora/test_qwen3moe_tp.py
|
|
||||||
|
|
||||||
parallelism: 4
|
parallelism: 4
|
||||||
|
|
||||||
- label: PyTorch Compilation Unit Tests # 15min
|
- label: PyTorch Compilation Unit Tests # 15min
|
||||||
@ -433,12 +368,12 @@ steps:
|
|||||||
- pytest -v -s compile/test_pass_manager.py
|
- pytest -v -s compile/test_pass_manager.py
|
||||||
- pytest -v -s compile/test_fusion.py
|
- pytest -v -s compile/test_fusion.py
|
||||||
- pytest -v -s compile/test_fusion_attn.py
|
- pytest -v -s compile/test_fusion_attn.py
|
||||||
- pytest -v -s compile/test_functionalization.py
|
|
||||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||||
|
- pytest -v -s compile/test_sequence_parallelism.py
|
||||||
|
- pytest -v -s compile/test_async_tp.py
|
||||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||||
- pytest -v -s compile/test_decorator.py
|
- pytest -v -s compile/test_decorator.py
|
||||||
- pytest -v -s compile/test_noop_elimination.py
|
- pytest -v -s compile/test_noop_elimination.py
|
||||||
- pytest -v -s compile/test_aot_compile.py
|
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@ -451,8 +386,8 @@ steps:
|
|||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s compile/piecewise/
|
- pytest -v -s compile/piecewise/
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 22min
|
- label: PyTorch Fullgraph Test # 20min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 30
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -460,19 +395,6 @@ steps:
|
|||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py
|
- pytest -v -s compile/test_full_graph.py
|
||||||
- pytest -v -s compile/test_fusions_e2e.py
|
|
||||||
|
|
||||||
- label: Cudagraph test
|
|
||||||
timeout_in_minutes: 20
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
source_file_dependencies:
|
|
||||||
- tests/v1/cudagraph
|
|
||||||
- vllm/v1/cudagraph_dispatcher.py
|
|
||||||
- vllm/config/compilation.py
|
|
||||||
- vllm/compilation
|
|
||||||
commands:
|
|
||||||
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
|
|
||||||
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
|
|
||||||
|
|
||||||
- label: Kernels Core Operation Test # 48min
|
- label: Kernels Core Operation Test # 48min
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
@ -480,9 +402,8 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
- tests/kernels/core
|
||||||
- tests/kernels/test_top_k_per_row.py
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
- pytest -v -s kernels/core
|
||||||
|
|
||||||
- label: Kernels Attention Test %N # 23min
|
- label: Kernels Attention Test %N # 23min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
@ -516,8 +437,6 @@ steps:
|
|||||||
- tests/kernels/moe
|
- tests/kernels/moe
|
||||||
- vllm/model_executor/layers/fused_moe/
|
- vllm/model_executor/layers/fused_moe/
|
||||||
- vllm/distributed/device_communicators/
|
- vllm/distributed/device_communicators/
|
||||||
- vllm/envs.py
|
|
||||||
- vllm/config
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 2
|
||||||
@ -528,23 +447,33 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/mamba/
|
- csrc/mamba/
|
||||||
- tests/kernels/mamba
|
- tests/kernels/mamba
|
||||||
- vllm/model_executor/layers/mamba/ops
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/mamba
|
- pytest -v -s kernels/mamba
|
||||||
|
|
||||||
- label: Model Executor Test # 23min
|
- label: Tensorizer Test # 14min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 25
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/model_executor
|
- vllm/model_executor/model_loader
|
||||||
- tests/model_executor
|
- tests/tensorizer_loader
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
commands:
|
commands:
|
||||||
- apt-get update && apt-get install -y curl libsodium23
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -v -s model_executor
|
- pytest -v -s tensorizer_loader
|
||||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
|
|
||||||
|
- label: Model Executor Test # 7min
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/model_executor
|
||||||
|
- tests/model_executor
|
||||||
|
commands:
|
||||||
|
- apt-get update && apt-get install -y curl libsodium23
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -v -s model_executor
|
||||||
|
|
||||||
- label: Benchmarks # 11min
|
- label: Benchmarks # 11min
|
||||||
timeout_in_minutes: 20
|
timeout_in_minutes: 20
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
@ -577,9 +506,8 @@ steps:
|
|||||||
# since torchao nightly is only compatible with torch nightly currently
|
# since torchao nightly is only compatible with torch nightly currently
|
||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
# TODO(jerryzh168): resolve the above comment
|
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
@ -607,17 +535,10 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/tool_use
|
- tests/tool_use
|
||||||
|
- tests/mistral_tool_use
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s -m 'not cpu_test' tool_use
|
- pytest -v -s tool_use
|
||||||
|
- pytest -v -s mistral_tool_use
|
||||||
- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
|
|
||||||
timeout_in_minutes: 10
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/tool_use
|
|
||||||
no_gpu: true
|
|
||||||
commands:
|
|
||||||
- pytest -v -s -m 'cpu_test' tool_use
|
|
||||||
|
|
||||||
##### models test #####
|
##### models test #####
|
||||||
|
|
||||||
@ -657,19 +578,13 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/test_transformers.py
|
- tests/models/test_transformers.py
|
||||||
- tests/models/test_registry.py
|
- tests/models/test_registry.py
|
||||||
commands:
|
|
||||||
- pytest -v -s models/test_transformers.py models/test_registry.py
|
|
||||||
|
|
||||||
- label: Basic Models Test (Other CPU) # 5min
|
|
||||||
timeout_in_minutes: 10
|
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- tests/models/test_utils.py
|
- tests/models/test_utils.py
|
||||||
- tests/models/test_vision.py
|
- tests/models/test_vision.py
|
||||||
no_gpu: true
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s models/test_utils.py models/test_vision.py
|
- pytest -v -s models/test_transformers.py \
|
||||||
|
models/test_registry.py \
|
||||||
|
models/test_utils.py \
|
||||||
|
models/test_vision.py
|
||||||
|
|
||||||
- label: Language Models Tests (Standard)
|
- label: Language Models Tests (Standard)
|
||||||
timeout_in_minutes: 25
|
timeout_in_minutes: 25
|
||||||
@ -728,10 +643,8 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/generation
|
- tests/models/language/generation
|
||||||
commands:
|
commands:
|
||||||
# Install fast path packages for testing against transformers
|
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||||
# Note: also needed to run plamo2 model in vLLM
|
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
|
||||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
|
||||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||||
|
|
||||||
- label: Language Models Test (PPL)
|
- label: Language Models Test (PPL)
|
||||||
@ -786,16 +699,6 @@ steps:
|
|||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/multimodal/
|
|
||||||
- vllm/inputs/
|
|
||||||
- vllm/v1/core/
|
|
||||||
commands:
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1
|
- label: Multi-Modal Models Test (Extended) 1
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
@ -851,16 +754,14 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||||
- pytest -v -s tests/models/test_initialization.py
|
- pytest -v -s tests/models/test_initialization.py
|
||||||
- pytest -v -s tests/models/test_transformers.py
|
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
- pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
|
- python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
|
||||||
|
|
||||||
- label: Blackwell Test # 21 min
|
- label: Blackwell Test # 38 min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
# optional: true
|
# optional: true
|
||||||
@ -873,6 +774,8 @@ steps:
|
|||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/compilation/fusion.py
|
||||||
|
- vllm/compilation/fusion_attn.py
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
@ -889,38 +792,19 @@ steps:
|
|||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
|
||||||
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
|
||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
# Fusion
|
||||||
|
|
||||||
- label: Blackwell Fusion Tests # 30 min
|
|
||||||
timeout_in_minutes: 40
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: b200
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/quantization/fp4/
|
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
- vllm/compilation/
|
|
||||||
# can affect pattern matching
|
|
||||||
- vllm/model_executor/layers/layernorm.py
|
|
||||||
- vllm/model_executor/layers/activation.py
|
|
||||||
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
|
||||||
commands:
|
|
||||||
- nvidia-smi
|
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py
|
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
|
||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||||
- pytest -v -s tests/compile/test_fusions_e2e.py
|
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
|
||||||
|
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: GPT-OSS Eval (Blackwell)
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
optional: true # run on nightlies
|
optional: true # disable while debugging
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/evals/gpt_oss
|
- tests/evals/gpt_oss
|
||||||
- vllm/model_executor/models/gpt_oss.py
|
- vllm/model_executor/models/gpt_oss.py
|
||||||
@ -928,34 +812,7 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
|
||||||
|
|
||||||
- label: Blackwell Quantized MoE Test
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
working_dir: "/vllm-workspace/"
|
|
||||||
gpu: b200
|
|
||||||
source_file_dependencies:
|
|
||||||
- tests/quantization/test_blackwell_moe.py
|
|
||||||
- vllm/model_executor/models/deepseek_v2.py
|
|
||||||
- vllm/model_executor/models/gpt_oss.py
|
|
||||||
- vllm/model_executor/models/llama4.py
|
|
||||||
- vllm/model_executor/layers/fused_moe
|
|
||||||
- vllm/model_executor/layers/quantization/compressed_tensors
|
|
||||||
- vllm/model_executor/layers/quantization/modelopt.py
|
|
||||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
|
||||||
commands:
|
|
||||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
|
||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
|
||||||
timeout_in_minutes: 120
|
|
||||||
gpu: b200
|
|
||||||
optional: true # run on nightlies
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
|
||||||
|
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
@ -999,61 +856,47 @@ steps:
|
|||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 68min
|
- label: Distributed Tests (2 GPUs) # 110min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 150
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/compilation/
|
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/engine/
|
- vllm/engine/
|
||||||
- vllm/executor/
|
- vllm/executor/
|
||||||
- vllm/worker/worker_base.py
|
- vllm/model_executor/models/
|
||||||
- vllm/v1/engine/
|
|
||||||
- vllm/v1/worker/
|
|
||||||
- tests/compile/test_basic_correctness.py
|
|
||||||
- tests/compile/test_wrapper.py
|
|
||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/entrypoints/llm/test_collective_rpc.py
|
- vllm/compilation
|
||||||
- tests/v1/distributed
|
- vllm/worker/worker_base.py
|
||||||
|
- vllm/worker/worker.py
|
||||||
|
- vllm/worker/model_runner.py
|
||||||
|
- entrypoints/llm/test_collective_rpc.py
|
||||||
|
- tests/v1/test_async_llm_dp.py
|
||||||
|
- tests/v1/test_external_lb_dp.py
|
||||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- tests/v1/shutdown
|
- vllm/v1/engine/
|
||||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
|
||||||
commands:
|
commands:
|
||||||
# https://github.com/NVIDIA/nccl/issues/1838
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
|
||||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
|
||||||
|
|
||||||
- label: Distributed Model Tests (2 GPUs) # 37min
|
|
||||||
timeout_in_minutes: 50
|
|
||||||
mirror_hardwares: [amdexperimental]
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
num_gpus: 2
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/model_executor/model_loader/sharded_state_loader.py
|
|
||||||
- vllm/model_executor/models/
|
|
||||||
- tests/basic_correctness/
|
|
||||||
- tests/model_executor/model_loader/test_sharded_state_loader.py
|
|
||||||
- tests/models/
|
|
||||||
commands:
|
|
||||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
|
|
||||||
# Avoid importing model tests that cause CUDA reinitialization error
|
# Avoid importing model tests that cause CUDA reinitialization error
|
||||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
||||||
|
# test sequence parallel
|
||||||
|
- pytest -v -s distributed/test_sequence_parallel.py
|
||||||
|
# this test fails consistently.
|
||||||
|
# TODO: investigate and fix
|
||||||
|
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||||
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
|
- pytest -v -s models/multimodal/generation/test_maverick.py
|
||||||
|
|
||||||
- label: Plugin Tests (2 GPUs) # 40min
|
- label: Plugin Tests (2 GPUs) # 40min
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@ -1074,11 +917,6 @@ steps:
|
|||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
# end io_processor plugins test
|
# end io_processor plugins test
|
||||||
# begin stat_logger plugins test
|
|
||||||
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
|
||||||
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
|
||||||
- pip uninstall dummy_stat_logger -y
|
|
||||||
# end stat_logger plugins test
|
|
||||||
# other tests continue here:
|
# other tests continue here:
|
||||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
@ -1118,7 +956,6 @@ steps:
|
|||||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||||
- pytest -v -s -x lora/test_llama_tp.py
|
- pytest -v -s -x lora/test_llama_tp.py
|
||||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
|
||||||
|
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test # 33min
|
- label: Weight Loading Multiple GPU Test # 33min
|
||||||
@ -1145,17 +982,6 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|
||||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
working_dir: "/vllm-workspace/tests"
|
|
||||||
num_gpus: 4
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
|
||||||
- tests/v1/kv_connector/nixl_integration/
|
|
||||||
commands:
|
|
||||||
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
|
||||||
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
|
||||||
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
@ -1186,30 +1012,13 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
##### H100 test #####
|
|
||||||
- label: LM Eval Large Models (H100) # optional
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- csrc/
|
|
||||||
- vllm/model_executor/layers/quantization
|
|
||||||
commands:
|
|
||||||
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
|
||||||
|
|
||||||
##### H200 test #####
|
##### H200 test #####
|
||||||
- label: Distributed Tests (H200) # optional
|
- label: Distrubted Tests (H200) # optional
|
||||||
gpu: h200
|
gpu: h200
|
||||||
optional: true
|
optional: true
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/compile/test_async_tp.py
|
|
||||||
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
|
||||||
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
|
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
|
|
||||||
@ -1221,34 +1030,3 @@ steps:
|
|||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
|
||||||
|
|
||||||
##### RL Integration Tests #####
|
|
||||||
- label: Prime-RL Integration Test # 15min
|
|
||||||
timeout_in_minutes: 30
|
|
||||||
optional: true
|
|
||||||
num_gpus: 2
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/
|
|
||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
|
||||||
|
|
||||||
- label: DeepSeek V2-Lite Accuracy
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
|
||||||
|
|
||||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
|
|
||||||
|
|||||||
17
.coveragerc
17
.coveragerc
@ -1,10 +1,5 @@
|
|||||||
[run]
|
[run]
|
||||||
# Track the installed vllm package (this is what actually gets imported during tests)
|
source = vllm
|
||||||
# Use wildcard pattern to match the installed location
|
|
||||||
source =
|
|
||||||
vllm
|
|
||||||
*/dist-packages/vllm
|
|
||||||
*/site-packages/vllm
|
|
||||||
omit =
|
omit =
|
||||||
*/tests/*
|
*/tests/*
|
||||||
*/test_*
|
*/test_*
|
||||||
@ -17,16 +12,6 @@ omit =
|
|||||||
*/benchmarks/*
|
*/benchmarks/*
|
||||||
*/docs/*
|
*/docs/*
|
||||||
|
|
||||||
[paths]
|
|
||||||
# Map all possible vllm locations to a canonical "vllm" path
|
|
||||||
# This ensures coverage.combine properly merges data from different test runs
|
|
||||||
source =
|
|
||||||
vllm
|
|
||||||
/vllm-workspace/src/vllm
|
|
||||||
/vllm-workspace/vllm
|
|
||||||
*/site-packages/vllm
|
|
||||||
*/dist-packages/vllm
|
|
||||||
|
|
||||||
[report]
|
[report]
|
||||||
exclude_lines =
|
exclude_lines =
|
||||||
pragma: no cover
|
pragma: no cover
|
||||||
|
|||||||
@ -1,4 +0,0 @@
|
|||||||
# Migrate from `yapf` & `isort` to `ruff`
|
|
||||||
d6953beb91da4e9c99be4c0a1304a2d24189535c
|
|
||||||
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
|
|
||||||
8fcaaf6a165e661f63fc51be906bc05b0767332f
|
|
||||||
50
.github/CODEOWNERS
vendored
50
.github/CODEOWNERS
vendored
@ -4,12 +4,19 @@
|
|||||||
# This lists cover the "core" components of vLLM that require careful review
|
# This lists cover the "core" components of vLLM that require careful review
|
||||||
/vllm/attention @LucasWilkinson
|
/vllm/attention @LucasWilkinson
|
||||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
|
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
|
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||||
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
|
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
|
/vllm/model_executor/layers/fused_moe @mgoin
|
||||||
|
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||||
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||||
/vllm/model_executor/layers/mamba @tdoublep
|
/vllm/model_executor/layers/mamba @tdoublep
|
||||||
/vllm/model_executor/model_loader @22quinn
|
/vllm/model_executor/model_loader @22quinn
|
||||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||||
|
/vllm/v1/attention @LucasWilkinson
|
||||||
|
/vllm/v1/sample @22quinn @houseroad
|
||||||
/vllm/vllm_flash_attn @LucasWilkinson
|
/vllm/vllm_flash_attn @LucasWilkinson
|
||||||
/vllm/lora @jeejeelee
|
/vllm/lora @jeejeelee
|
||||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||||
@ -21,17 +28,14 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||||
# so spam a lot of people
|
# so spam a lot of people
|
||||||
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||||
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1/attention @LucasWilkinson
|
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||||
/vllm/v1/attention/backends/mla @pavanimajety
|
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||||
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
|
/vllm/v1/spec_decode @benchislett @luccafong
|
||||||
|
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
|
||||||
/vllm/v1/spec_decode @benchislett @luccafong
|
|
||||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
|
||||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||||
/vllm/v1/offloading @ApostaC
|
/vllm/v1/offloading @ApostaC
|
||||||
|
|
||||||
@ -45,7 +49,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
||||||
/tests/models @DarkLight1337 @ywang96
|
/tests/models @DarkLight1337 @ywang96
|
||||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
|
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
||||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||||
@ -53,35 +57,27 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||||
/tests/lora @jeejeelee
|
/tests/lora @jeejeelee
|
||||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||||
/tests/v1/kv_connector @ApostaC
|
/tests/v1/kv_connector @ApostaC
|
||||||
/tests/v1/offloading @ApostaC
|
/tests/v1/offloading @ApostaC
|
||||||
|
|
||||||
# Transformers backend
|
# Transformers backend
|
||||||
/vllm/model_executor/models/transformers @hmellor
|
/vllm/model_executor/models/transformers.py @hmellor
|
||||||
/tests/models/test_transformers.py @hmellor
|
/tests/models/test_transformers.py @hmellor
|
||||||
|
|
||||||
# Docs
|
# Docs
|
||||||
/docs/mkdocs @hmellor
|
/docs @hmellor
|
||||||
/docs/**/*.yml @hmellor
|
|
||||||
/requirements/docs.txt @hmellor
|
|
||||||
.readthedocs.yaml @hmellor
|
|
||||||
mkdocs.yaml @hmellor
|
mkdocs.yaml @hmellor
|
||||||
|
|
||||||
# Linting
|
|
||||||
.markdownlint.yaml @hmellor
|
|
||||||
.pre-commit-config.yaml @hmellor
|
|
||||||
/tools/pre_commit @hmellor
|
|
||||||
|
|
||||||
# CPU
|
# CPU
|
||||||
/vllm/v1/worker/cpu* @bigPYJ1151
|
/vllm/v1/worker/^cpu @bigPYJ1151
|
||||||
/csrc/cpu @bigPYJ1151
|
/csrc/cpu @bigPYJ1151
|
||||||
/vllm/platforms/cpu.py @bigPYJ1151
|
/vllm/platforms/cpu.py @bigPYJ1151
|
||||||
/cmake/cpu_extension.cmake @bigPYJ1151
|
/cmake/cpu_extension.cmake @bigPYJ1151
|
||||||
/docker/Dockerfile.cpu @bigPYJ1151
|
/docker/Dockerfile.cpu @bigPYJ1151
|
||||||
|
|
||||||
# Intel GPU
|
# Intel GPU
|
||||||
/vllm/v1/worker/xpu* @jikunshang
|
/vllm/v1/worker/^xpu @jikunshang
|
||||||
/vllm/platforms/xpu.py @jikunshang
|
/vllm/platforms/xpu.py @jikunshang
|
||||||
/docker/Dockerfile.xpu @jikunshang
|
/docker/Dockerfile.xpu @jikunshang
|
||||||
|
|
||||||
@ -119,11 +115,3 @@ mkdocs.yaml @hmellor
|
|||||||
|
|
||||||
# KVConnector installation files
|
# KVConnector installation files
|
||||||
/requirements/kv_connectors.txt @NickLucche
|
/requirements/kv_connectors.txt @NickLucche
|
||||||
|
|
||||||
# Pooling models
|
|
||||||
/examples/*/pooling/ @noooop
|
|
||||||
/tests/models/*/pooling* @noooop
|
|
||||||
/tests/entrypoints/pooling @noooop
|
|
||||||
/vllm/config/pooler.py @noooop
|
|
||||||
/vllm/pooling_params.py @noooop
|
|
||||||
/vllm/model_executor/layers/pooler.py @noooop
|
|
||||||
|
|||||||
4
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
4
.github/ISSUE_TEMPLATE/750-RFC.yml
vendored
@ -43,6 +43,10 @@ body:
|
|||||||
Any other things you would like to mention.
|
Any other things you would like to mention.
|
||||||
validations:
|
validations:
|
||||||
required: false
|
required: false
|
||||||
|
- type: markdown
|
||||||
|
attributes:
|
||||||
|
value: >
|
||||||
|
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time. While most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
|
||||||
- type: checkboxes
|
- type: checkboxes
|
||||||
id: askllm
|
id: askllm
|
||||||
attributes:
|
attributes:
|
||||||
|
|||||||
37
.github/mergify.yml
vendored
37
.github/mergify.yml
vendored
@ -2,7 +2,6 @@ pull_request_rules:
|
|||||||
- name: label-documentation
|
- name: label-documentation
|
||||||
description: Automatically apply documentation label
|
description: Automatically apply documentation label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^[^/]+\.md$
|
- files~=^[^/]+\.md$
|
||||||
- files~=^docs/
|
- files~=^docs/
|
||||||
@ -11,13 +10,10 @@ pull_request_rules:
|
|||||||
label:
|
label:
|
||||||
add:
|
add:
|
||||||
- documentation
|
- documentation
|
||||||
comment:
|
|
||||||
message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
|
|
||||||
|
|
||||||
- name: label-ci-build
|
- name: label-ci-build
|
||||||
description: Automatically apply ci/build label
|
description: Automatically apply ci/build label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^\.github/
|
- files~=^\.github/
|
||||||
- files~=\.buildkite/
|
- files~=\.buildkite/
|
||||||
@ -34,7 +30,6 @@ pull_request_rules:
|
|||||||
- name: label-deepseek
|
- name: label-deepseek
|
||||||
description: Automatically apply deepseek label
|
description: Automatically apply deepseek label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*deepseek.*\.py
|
- files~=^examples/.*deepseek.*\.py
|
||||||
- files~=^tests/.*deepseek.*\.py
|
- files~=^tests/.*deepseek.*\.py
|
||||||
@ -51,7 +46,6 @@ pull_request_rules:
|
|||||||
- name: label-frontend
|
- name: label-frontend
|
||||||
description: Automatically apply frontend label
|
description: Automatically apply frontend label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- files~=^vllm/entrypoints/
|
- files~=^vllm/entrypoints/
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
@ -61,7 +55,6 @@ pull_request_rules:
|
|||||||
- name: label-llama
|
- name: label-llama
|
||||||
description: Automatically apply llama label
|
description: Automatically apply llama label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*llama.*\.py
|
- files~=^examples/.*llama.*\.py
|
||||||
- files~=^tests/.*llama.*\.py
|
- files~=^tests/.*llama.*\.py
|
||||||
@ -77,7 +70,6 @@ pull_request_rules:
|
|||||||
- name: label-multi-modality
|
- name: label-multi-modality
|
||||||
description: Automatically apply multi-modality label
|
description: Automatically apply multi-modality label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/multimodal/
|
- files~=^vllm/multimodal/
|
||||||
- files~=^tests/multimodal/
|
- files~=^tests/multimodal/
|
||||||
@ -91,7 +83,6 @@ pull_request_rules:
|
|||||||
- name: label-new-model
|
- name: label-new-model
|
||||||
description: Automatically apply new-model label
|
description: Automatically apply new-model label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- and:
|
- and:
|
||||||
- files~=^vllm/model_executor/models/
|
- files~=^vllm/model_executor/models/
|
||||||
- files=vllm/model_executor/models/registry.py
|
- files=vllm/model_executor/models/registry.py
|
||||||
@ -103,12 +94,11 @@ pull_request_rules:
|
|||||||
- name: label-performance
|
- name: label-performance
|
||||||
description: Automatically apply performance label
|
description: Automatically apply performance label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^benchmarks/
|
- files~=^benchmarks/
|
||||||
- files~=^vllm/benchmarks/
|
- files~=^vllm/benchmarks/
|
||||||
- files~=^tests/benchmarks/
|
- files~=^tests/benchmarks/
|
||||||
- files~=^\.buildkite/performance-benchmarks/
|
- files~=^\.buildkite/nightly-benchmarks/
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
add:
|
add:
|
||||||
@ -117,7 +107,6 @@ pull_request_rules:
|
|||||||
- name: label-qwen
|
- name: label-qwen
|
||||||
description: Automatically apply qwen label
|
description: Automatically apply qwen label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*qwen.*\.py
|
- files~=^examples/.*qwen.*\.py
|
||||||
- files~=^tests/.*qwen.*\.py
|
- files~=^tests/.*qwen.*\.py
|
||||||
@ -132,7 +121,6 @@ pull_request_rules:
|
|||||||
- name: label-gpt-oss
|
- name: label-gpt-oss
|
||||||
description: Automatically apply gpt-oss label
|
description: Automatically apply gpt-oss label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*gpt[-_]?oss.*\.py
|
- files~=^examples/.*gpt[-_]?oss.*\.py
|
||||||
- files~=^tests/.*gpt[-_]?oss.*\.py
|
- files~=^tests/.*gpt[-_]?oss.*\.py
|
||||||
@ -154,7 +142,6 @@ pull_request_rules:
|
|||||||
- name: label-rocm
|
- name: label-rocm
|
||||||
description: Automatically apply rocm label
|
description: Automatically apply rocm label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^csrc/rocm/
|
- files~=^csrc/rocm/
|
||||||
- files~=^docker/Dockerfile.rocm
|
- files~=^docker/Dockerfile.rocm
|
||||||
@ -175,7 +162,6 @@ pull_request_rules:
|
|||||||
- name: label-structured-output
|
- name: label-structured-output
|
||||||
description: Automatically apply structured-output label
|
description: Automatically apply structured-output label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^benchmarks/structured_schemas/
|
- files~=^benchmarks/structured_schemas/
|
||||||
- files=benchmarks/benchmark_serving_structured_output.py
|
- files=benchmarks/benchmark_serving_structured_output.py
|
||||||
@ -195,7 +181,6 @@ pull_request_rules:
|
|||||||
- name: label-speculative-decoding
|
- name: label-speculative-decoding
|
||||||
description: Automatically apply speculative-decoding label
|
description: Automatically apply speculative-decoding label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/v1/spec_decode/
|
- files~=^vllm/v1/spec_decode/
|
||||||
- files~=^tests/v1/spec_decode/
|
- files~=^tests/v1/spec_decode/
|
||||||
@ -211,7 +196,6 @@ pull_request_rules:
|
|||||||
- name: label-v1
|
- name: label-v1
|
||||||
description: Automatically apply v1 label
|
description: Automatically apply v1 label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/v1/
|
- files~=^vllm/v1/
|
||||||
- files~=^tests/v1/
|
- files~=^tests/v1/
|
||||||
@ -224,7 +208,6 @@ pull_request_rules:
|
|||||||
description: Automatically apply tpu label
|
description: Automatically apply tpu label
|
||||||
# Keep this list in sync with `label-tpu-remove` conditions
|
# Keep this list in sync with `label-tpu-remove` conditions
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=tpu.py
|
- files~=tpu.py
|
||||||
- files~=_tpu
|
- files~=_tpu
|
||||||
@ -240,7 +223,6 @@ pull_request_rules:
|
|||||||
description: Automatically remove tpu label
|
description: Automatically remove tpu label
|
||||||
# Keep this list in sync with `label-tpu` conditions
|
# Keep this list in sync with `label-tpu` conditions
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- and:
|
- and:
|
||||||
- -files~=tpu.py
|
- -files~=tpu.py
|
||||||
- -files~=_tpu
|
- -files~=_tpu
|
||||||
@ -255,9 +237,9 @@ pull_request_rules:
|
|||||||
- name: label-tool-calling
|
- name: label-tool-calling
|
||||||
description: Automatically add tool-calling label
|
description: Automatically add tool-calling label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^tests/tool_use/
|
- files~=^tests/tool_use/
|
||||||
|
- files~=^tests/mistral_tool_use/
|
||||||
- files~=^tests/entrypoints/openai/tool_parsers/
|
- files~=^tests/entrypoints/openai/tool_parsers/
|
||||||
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
|
||||||
- files~=^vllm/entrypoints/openai/tool_parsers/
|
- files~=^vllm/entrypoints/openai/tool_parsers/
|
||||||
@ -274,9 +256,8 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: ping author on conflicts and add 'needs-rebase' label
|
- name: ping author on conflicts and add 'needs-rebase' label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
- conflict
|
||||||
- conflict
|
- -closed
|
||||||
- -closed
|
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
add:
|
add:
|
||||||
@ -290,12 +271,10 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: assign reviewer for tensorizer changes
|
- name: assign reviewer for tensorizer changes
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
- files~=^tests/tensorizer_loader/
|
||||||
actions:
|
actions:
|
||||||
assign:
|
assign:
|
||||||
users:
|
users:
|
||||||
@ -303,7 +282,6 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: assign reviewer for modelopt changes
|
- name: assign reviewer for modelopt changes
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
|
- files~=^vllm/model_executor/layers/quantization/modelopt\.py$
|
||||||
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
|
- files~=^vllm/model_executor/layers/quantization/__init__\.py$
|
||||||
@ -318,8 +296,8 @@ pull_request_rules:
|
|||||||
|
|
||||||
- name: remove 'needs-rebase' label when conflict is resolved
|
- name: remove 'needs-rebase' label when conflict is resolved
|
||||||
conditions:
|
conditions:
|
||||||
- -conflict
|
- -conflict
|
||||||
- -closed
|
- -closed
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
remove:
|
remove:
|
||||||
@ -328,7 +306,6 @@ pull_request_rules:
|
|||||||
- name: label-kv-connector
|
- name: label-kv-connector
|
||||||
description: Automatically apply kv-connector label
|
description: Automatically apply kv-connector label
|
||||||
conditions:
|
conditions:
|
||||||
- label != stale
|
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/online_serving/disaggregated[^/]*/.*
|
- files~=^examples/online_serving/disaggregated[^/]*/.*
|
||||||
- files~=^examples/offline_inference/disaggregated[^/]*/.*
|
- files~=^examples/offline_inference/disaggregated[^/]*/.*
|
||||||
|
|||||||
138
.github/workflows/issue_autolabel.yml
vendored
138
.github/workflows/issue_autolabel.yml
vendored
@ -13,7 +13,6 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Label issues based on keywords
|
- name: Label issues based on keywords
|
||||||
id: label-step
|
|
||||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||||
with:
|
with:
|
||||||
script: |
|
script: |
|
||||||
@ -43,6 +42,7 @@ jobs:
|
|||||||
searchIn: "body"
|
searchIn: "body"
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
||||||
// Substring search - matches anywhere in text (partial matches)
|
// Substring search - matches anywhere in text (partial matches)
|
||||||
substrings: [
|
substrings: [
|
||||||
{
|
{
|
||||||
@ -89,12 +89,14 @@ jobs:
|
|||||||
term: "hip_",
|
term: "hip_",
|
||||||
searchIn: "both"
|
searchIn: "both"
|
||||||
},
|
},
|
||||||
|
|
||||||
// ROCm tools and libraries
|
// ROCm tools and libraries
|
||||||
{
|
{
|
||||||
term: "hipify",
|
term: "hipify",
|
||||||
searchIn: "both"
|
searchIn: "both"
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
||||||
// Regex patterns - for complex pattern matching
|
// Regex patterns - for complex pattern matching
|
||||||
regexPatterns: [
|
regexPatterns: [
|
||||||
{
|
{
|
||||||
@ -105,17 +107,13 @@ jobs:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
// Add more label configurations here as needed
|
|
||||||
// example: {
|
|
||||||
// keywords: [...],
|
|
||||||
// substrings: [...],
|
|
||||||
// regexPatterns: [...]
|
|
||||||
// },
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Helper function to create regex based on search type
|
// Helper function to create regex based on search type
|
||||||
function createSearchRegex(term, type) {
|
function createSearchRegex(term, type) {
|
||||||
// Escape special regex characters in the term
|
// Escape special regex characters in the term
|
||||||
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case 'keyword':
|
case 'keyword':
|
||||||
// Word boundary search - matches whole words only
|
// Word boundary search - matches whole words only
|
||||||
@ -127,13 +125,16 @@ jobs:
|
|||||||
throw new Error(`Unknown search type: ${type}`);
|
throw new Error(`Unknown search type: ${type}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to find matching terms in text with line information
|
// Helper function to find matching terms in text with line information
|
||||||
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
|
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
|
||||||
const matches = [];
|
const matches = [];
|
||||||
const lines = text.split('\n');
|
const lines = text.split('\n');
|
||||||
|
|
||||||
for (const termConfig of searchTerms) {
|
for (const termConfig of searchTerms) {
|
||||||
let regex;
|
let regex;
|
||||||
let term, searchIn, pattern, description, flags;
|
let term, searchIn, pattern, description, flags;
|
||||||
|
|
||||||
// Handle different input formats (string or object)
|
// Handle different input formats (string or object)
|
||||||
if (typeof termConfig === 'string') {
|
if (typeof termConfig === 'string') {
|
||||||
term = termConfig;
|
term = termConfig;
|
||||||
@ -145,17 +146,21 @@ jobs:
|
|||||||
description = termConfig.description;
|
description = termConfig.description;
|
||||||
flags = termConfig.flags;
|
flags = termConfig.flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip if this term shouldn't be searched in the current location
|
// Skip if this term shouldn't be searched in the current location
|
||||||
if (searchIn !== 'both' && searchIn !== searchLocation) {
|
if (searchIn !== 'both' && searchIn !== searchLocation) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create appropriate regex
|
// Create appropriate regex
|
||||||
if (searchType === 'regex') {
|
if (searchType === 'regex') {
|
||||||
regex = new RegExp(pattern, flags || "gi");
|
regex = new RegExp(pattern, flags || "gi");
|
||||||
} else {
|
} else {
|
||||||
regex = createSearchRegex(term, searchType);
|
regex = createSearchRegex(term, searchType);
|
||||||
}
|
}
|
||||||
|
|
||||||
const termMatches = [];
|
const termMatches = [];
|
||||||
|
|
||||||
// Check each line for matches
|
// Check each line for matches
|
||||||
lines.forEach((line, lineIndex) => {
|
lines.forEach((line, lineIndex) => {
|
||||||
const lineMatches = line.match(regex);
|
const lineMatches = line.match(regex);
|
||||||
@ -170,14 +175,15 @@ jobs:
|
|||||||
originalTerm: term || pattern,
|
originalTerm: term || pattern,
|
||||||
description: description,
|
description: description,
|
||||||
// Show context around the match in the line
|
// Show context around the match in the line
|
||||||
context: line.length > 100 ?
|
context: line.length > 100 ?
|
||||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||||
: line.trim()
|
: line.trim()
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
if (termMatches.length > 0) {
|
if (termMatches.length > 0) {
|
||||||
matches.push({
|
matches.push({
|
||||||
term: term || (description || pattern),
|
term: term || (description || pattern),
|
||||||
@ -190,48 +196,64 @@ jobs:
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return matches;
|
return matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to check if label should be added
|
// Helper function to check if label should be added
|
||||||
async function processLabel(labelName, config) {
|
async function processLabel(labelName, config) {
|
||||||
const body = context.payload.issue.body || "";
|
const body = context.payload.issue.body || "";
|
||||||
const title = context.payload.issue.title || "";
|
const title = context.payload.issue.title || "";
|
||||||
|
|
||||||
core.notice(`Processing label: ${labelName}`);
|
core.notice(`Processing label: ${labelName}`);
|
||||||
core.notice(`Issue Title: "${title}"`);
|
core.notice(`Issue Title: "${title}"`);
|
||||||
core.notice(`Issue Body length: ${body.length} characters`);
|
core.notice(`Issue Body length: ${body.length} characters`);
|
||||||
|
|
||||||
let shouldAddLabel = false;
|
let shouldAddLabel = false;
|
||||||
let allMatches = [];
|
let allMatches = [];
|
||||||
let reason = '';
|
let reason = '';
|
||||||
|
|
||||||
const keywords = config.keywords || [];
|
const keywords = config.keywords || [];
|
||||||
const substrings = config.substrings || [];
|
const substrings = config.substrings || [];
|
||||||
const regexPatterns = config.regexPatterns || [];
|
const regexPatterns = config.regexPatterns || [];
|
||||||
|
|
||||||
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
|
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
|
||||||
|
|
||||||
// Search in title
|
// Search in title
|
||||||
if (title.trim()) {
|
if (title.trim()) {
|
||||||
core.notice(`Searching in title: "${title}"`);
|
core.notice(`Searching in title: "${title}"`);
|
||||||
|
|
||||||
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
|
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
|
||||||
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
|
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
|
||||||
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
|
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
|
||||||
|
|
||||||
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
|
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Search in body
|
// Search in body
|
||||||
if (body.trim()) {
|
if (body.trim()) {
|
||||||
core.notice(`Searching in body (${body.length} characters)`);
|
core.notice(`Searching in body (${body.length} characters)`);
|
||||||
|
|
||||||
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
|
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
|
||||||
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
|
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
|
||||||
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
|
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
|
||||||
|
|
||||||
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
|
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (allMatches.length > 0) {
|
if (allMatches.length > 0) {
|
||||||
core.notice(`Found ${allMatches.length} matching term(s):`);
|
core.notice(`Found ${allMatches.length} matching term(s):`);
|
||||||
|
|
||||||
for (const termMatch of allMatches) {
|
for (const termMatch of allMatches) {
|
||||||
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
|
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
|
||||||
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
|
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
|
||||||
|
|
||||||
if (termMatch.searchType === 'regex') {
|
if (termMatch.searchType === 'regex') {
|
||||||
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||||
} else {
|
} else {
|
||||||
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Show details for each match
|
// Show details for each match
|
||||||
termMatch.matches.forEach((match, index) => {
|
termMatch.matches.forEach((match, index) => {
|
||||||
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
|
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
|
||||||
@ -244,6 +266,7 @@ jobs:
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
shouldAddLabel = true;
|
shouldAddLabel = true;
|
||||||
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
|
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
|
||||||
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
|
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
|
||||||
@ -251,10 +274,13 @@ jobs:
|
|||||||
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
|
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
|
||||||
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
|
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
|
||||||
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
|
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
|
||||||
|
|
||||||
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
|
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
|
||||||
}
|
}
|
||||||
|
|
||||||
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
|
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
|
||||||
core.notice(`Reason: ${reason || 'No matching terms found'}`);
|
core.notice(`Reason: ${reason || 'No matching terms found'}`);
|
||||||
|
|
||||||
if (shouldAddLabel) {
|
if (shouldAddLabel) {
|
||||||
const existingLabels = context.payload.issue.labels.map(l => l.name);
|
const existingLabels = context.payload.issue.labels.map(l => l.name);
|
||||||
if (!existingLabels.includes(labelName)) {
|
if (!existingLabels.includes(labelName)) {
|
||||||
@ -270,92 +296,14 @@ jobs:
|
|||||||
core.notice(`Label "${labelName}" already present.`);
|
core.notice(`Label "${labelName}" already present.`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
core.notice(`No matching terms found for label "${labelName}".`);
|
core.notice(`No matching terms found for label "${labelName}".`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process all configured labels
|
// Process all configured labels
|
||||||
const labelsAddedResults = await Promise.all(
|
const processLabels = Object.entries(labelConfig)
|
||||||
Object.entries(labelConfig).map(([labelName, config]) =>
|
.map(([labelName, config]) => processLabel(labelName, config));
|
||||||
processLabel(labelName, config).then(added => ({ labelName, added }))
|
const labelsAdded = await Promise.all(processLabels);
|
||||||
)
|
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
|
||||||
);
|
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||||
|
|
||||||
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
|
|
||||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
|
||||||
|
|
||||||
// Return which labels were added for the next step
|
|
||||||
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
|
|
||||||
core.setOutput('labels_added', JSON.stringify(addedLabels));
|
|
||||||
return addedLabels;
|
|
||||||
|
|
||||||
- name: CC users for labeled issues
|
|
||||||
if: steps.label-step.outputs.labels_added != '[]'
|
|
||||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
|
||||||
with:
|
|
||||||
script: |
|
|
||||||
// Configuration: Map labels to GitHub users to CC
|
|
||||||
// You can add multiple users per label, and multiple label configurations
|
|
||||||
const ccConfig = {
|
|
||||||
rocm: {
|
|
||||||
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
|
|
||||||
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
|
|
||||||
},
|
|
||||||
// Add more label -> user mappings here
|
|
||||||
// Example:
|
|
||||||
// cuda: {
|
|
||||||
// users: ['user1', 'user2'],
|
|
||||||
// message: 'CC {users} for CUDA-related issue'
|
|
||||||
// },
|
|
||||||
// performance: {
|
|
||||||
// users: ['perfexpert'],
|
|
||||||
// message: 'CC {users} for performance issue'
|
|
||||||
// },
|
|
||||||
};
|
|
||||||
|
|
||||||
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
|
|
||||||
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
|
|
||||||
|
|
||||||
// Get existing comments to check for already mentioned users
|
|
||||||
const comments = await github.rest.issues.listComments({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
issue_number: context.issue.number,
|
|
||||||
});
|
|
||||||
|
|
||||||
const issueBody = context.payload.issue.body || '';
|
|
||||||
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
|
|
||||||
|
|
||||||
// Process each label that was added
|
|
||||||
for (const label of labelsAdded) {
|
|
||||||
if (ccConfig[label]) {
|
|
||||||
const config = ccConfig[label];
|
|
||||||
const usersToMention = [];
|
|
||||||
|
|
||||||
// Check which users haven't been mentioned yet
|
|
||||||
for (const user of config.users) {
|
|
||||||
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
|
|
||||||
if (!mentionPattern.test(allExistingText)) {
|
|
||||||
usersToMention.push(user);
|
|
||||||
} else {
|
|
||||||
core.notice(`@${user} already mentioned for label "${label}", skipping`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Post comment if there are users to mention
|
|
||||||
if (usersToMention.length > 0) {
|
|
||||||
const mentions = usersToMention.map(u => `@${u}`).join(' ');
|
|
||||||
const message = config.message.replace('{users}', mentions);
|
|
||||||
|
|
||||||
await github.rest.issues.createComment({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
issue_number: context.issue.number,
|
|
||||||
body: message
|
|
||||||
});
|
|
||||||
|
|
||||||
core.notice(`CC comment added for label "${label}": ${mentions}`);
|
|
||||||
} else {
|
|
||||||
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
2
.github/workflows/stale.yml
vendored
2
.github/workflows/stale.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
|||||||
actions: write
|
actions: write
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
|
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
|
||||||
with:
|
with:
|
||||||
# Increasing this value ensures that changes to this workflow
|
# Increasing this value ensures that changes to this workflow
|
||||||
# propagate to all issues and PRs in days rather than months
|
# propagate to all issues and PRs in days rather than months
|
||||||
|
|||||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -94,9 +94,6 @@ ipython_config.py
|
|||||||
# generated files
|
# generated files
|
||||||
**/generated/**
|
**/generated/**
|
||||||
|
|
||||||
# uv
|
|
||||||
uv.lock
|
|
||||||
|
|
||||||
# pyenv
|
# pyenv
|
||||||
# For a library or package, you might want to ignore these files since the code is
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
# intended to run in multiple environments; otherwise, check them in:
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
|||||||
@ -4,6 +4,7 @@ MD013: false
|
|||||||
MD024:
|
MD024:
|
||||||
siblings_only: true
|
siblings_only: true
|
||||||
MD033: false
|
MD033: false
|
||||||
|
MD042: false
|
||||||
MD045: false
|
MD045: false
|
||||||
MD046: false
|
MD046: false
|
||||||
MD051: false
|
MD051: false
|
||||||
|
|||||||
@ -6,19 +6,30 @@ default_stages:
|
|||||||
- manual # Run in CI
|
- manual # Run in CI
|
||||||
exclude: 'vllm/third_party/.*'
|
exclude: 'vllm/third_party/.*'
|
||||||
repos:
|
repos:
|
||||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
- repo: https://github.com/google/yapf
|
||||||
rev: v0.14.0
|
rev: v0.43.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: ruff-check
|
- id: yapf
|
||||||
|
args: [--in-place, --verbose]
|
||||||
|
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
|
||||||
|
exclude: '(.buildkite|benchmarks|build|examples)/.*'
|
||||||
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
|
rev: v0.11.7
|
||||||
|
hooks:
|
||||||
|
- id: ruff
|
||||||
args: [--output-format, github, --fix]
|
args: [--output-format, github, --fix]
|
||||||
- id: ruff-format
|
- id: ruff-format
|
||||||
|
files: ^(.buildkite|benchmarks|examples)/.*
|
||||||
- repo: https://github.com/crate-ci/typos
|
- repo: https://github.com/crate-ci/typos
|
||||||
rev: v1.38.1
|
rev: v1.35.5
|
||||||
hooks:
|
hooks:
|
||||||
- id: typos
|
- id: typos
|
||||||
args: [--force-exclude]
|
- repo: https://github.com/PyCQA/isort
|
||||||
|
rev: 6.0.1
|
||||||
|
hooks:
|
||||||
|
- id: isort
|
||||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||||
rev: v21.1.2
|
rev: v20.1.3
|
||||||
hooks:
|
hooks:
|
||||||
- id: clang-format
|
- id: clang-format
|
||||||
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
|
||||||
@ -35,55 +46,61 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: actionlint
|
- id: actionlint
|
||||||
- repo: https://github.com/astral-sh/uv-pre-commit
|
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||||
rev: 0.9.1
|
rev: 0.6.17
|
||||||
hooks:
|
hooks:
|
||||||
- id: pip-compile
|
- id: pip-compile
|
||||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
|
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
|
||||||
files: ^requirements/test\.(in|txt)$
|
files: ^requirements/test\.(in|txt)$
|
||||||
- repo: local
|
- repo: local
|
||||||
hooks:
|
hooks:
|
||||||
- id: format-torch-nightly-test
|
- id: format-torch-nightly-test
|
||||||
name: reformat nightly_torch_test.txt to be in sync with test.in
|
name: reformat nightly_torch_test.txt to be in sync with test.in
|
||||||
language: python
|
language: python
|
||||||
entry: python tools/pre_commit/generate_nightly_torch_test.py
|
entry: python tools/generate_nightly_torch_test.py
|
||||||
files: ^requirements/test\.(in|txt)$
|
files: ^requirements/test\.(in|txt)$
|
||||||
- id: mypy-local
|
- id: mypy-local
|
||||||
name: Run mypy locally for lowest supported Python version
|
name: Run mypy for local Python installation
|
||||||
entry: python tools/pre_commit/mypy.py 0 "3.10"
|
entry: tools/mypy.sh 0 "local"
|
||||||
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||||
stages: [pre-commit] # Don't run in CI
|
stages: [pre-commit] # Don't run in CI
|
||||||
<<: &mypy_common
|
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
language: python
|
name: Run mypy for Python 3.9
|
||||||
types_or: [python, pyi]
|
entry: tools/mypy.sh 1 "3.9"
|
||||||
require_serial: true
|
language: python
|
||||||
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
types: [python]
|
||||||
|
additional_dependencies: *mypy_deps
|
||||||
|
stages: [manual] # Only run in CI
|
||||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.10
|
name: Run mypy for Python 3.10
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
entry: tools/mypy.sh 1 "3.10"
|
||||||
<<: *mypy_common
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: *mypy_deps
|
||||||
stages: [manual] # Only run in CI
|
stages: [manual] # Only run in CI
|
||||||
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.11
|
name: Run mypy for Python 3.11
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.11"
|
entry: tools/mypy.sh 1 "3.11"
|
||||||
<<: *mypy_common
|
language: python
|
||||||
|
types: [python]
|
||||||
|
additional_dependencies: *mypy_deps
|
||||||
stages: [manual] # Only run in CI
|
stages: [manual] # Only run in CI
|
||||||
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||||
name: Run mypy for Python 3.12
|
name: Run mypy for Python 3.12
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.12"
|
entry: tools/mypy.sh 1 "3.12"
|
||||||
<<: *mypy_common
|
language: python
|
||||||
stages: [manual] # Only run in CI
|
types: [python]
|
||||||
- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
additional_dependencies: *mypy_deps
|
||||||
name: Run mypy for Python 3.13
|
|
||||||
entry: python tools/pre_commit/mypy.py 1 "3.13"
|
|
||||||
<<: *mypy_common
|
|
||||||
stages: [manual] # Only run in CI
|
stages: [manual] # Only run in CI
|
||||||
- id: shellcheck
|
- id: shellcheck
|
||||||
name: Lint shell scripts
|
name: Lint shell scripts
|
||||||
entry: tools/pre_commit/shellcheck.sh
|
entry: tools/shellcheck.sh
|
||||||
language: script
|
language: script
|
||||||
types: [shell]
|
types: [shell]
|
||||||
- id: png-lint
|
- id: png-lint
|
||||||
name: Lint PNG exports from excalidraw
|
name: Lint PNG exports from excalidraw
|
||||||
entry: tools/pre_commit/png-lint.sh
|
entry: tools/png-lint.sh
|
||||||
language: script
|
language: script
|
||||||
types: [png]
|
types: [png]
|
||||||
- id: signoff-commit
|
- id: signoff-commit
|
||||||
@ -100,12 +117,12 @@ repos:
|
|||||||
stages: [commit-msg]
|
stages: [commit-msg]
|
||||||
- id: check-spdx-header
|
- id: check-spdx-header
|
||||||
name: Check SPDX headers
|
name: Check SPDX headers
|
||||||
entry: python tools/pre_commit/check_spdx_header.py
|
entry: python tools/check_spdx_header.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
- id: check-root-lazy-imports
|
- id: check-root-lazy-imports
|
||||||
name: Check root lazy imports
|
name: Check root lazy imports
|
||||||
entry: python tools/pre_commit/check_init_lazy_imports.py
|
entry: python tools/check_init_lazy_imports.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
- id: check-filenames
|
- id: check-filenames
|
||||||
@ -119,11 +136,11 @@ repos:
|
|||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
- id: update-dockerfile-graph
|
- id: update-dockerfile-graph
|
||||||
name: Update Dockerfile dependency graph
|
name: Update Dockerfile dependency graph
|
||||||
entry: tools/pre_commit/update-dockerfile-graph.sh
|
entry: tools/update-dockerfile-graph.sh
|
||||||
language: script
|
language: script
|
||||||
- id: enforce-import-regex-instead-of-re
|
- id: enforce-import-regex-instead-of-re
|
||||||
name: Enforce import regex as re
|
name: Enforce import regex as re
|
||||||
entry: python tools/pre_commit/enforce_regex_import.py
|
entry: python tools/enforce_regex_import.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
@ -131,20 +148,21 @@ repos:
|
|||||||
# forbid directly import triton
|
# forbid directly import triton
|
||||||
- id: forbid-direct-triton-import
|
- id: forbid-direct-triton-import
|
||||||
name: "Forbid direct 'import triton'"
|
name: "Forbid direct 'import triton'"
|
||||||
entry: python tools/pre_commit/check_triton_import.py
|
entry: python tools/check_triton_import.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
- id: check-pickle-imports
|
- id: check-pickle-imports
|
||||||
name: Prevent new pickle/cloudpickle imports
|
name: Prevent new pickle/cloudpickle imports
|
||||||
entry: python tools/pre_commit/check_pickle_imports.py
|
entry: python tools/check_pickle_imports.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
additional_dependencies: [regex]
|
pass_filenames: false
|
||||||
|
additional_dependencies: [pathspec, regex]
|
||||||
- id: validate-config
|
- id: validate-config
|
||||||
name: Validate configuration has default values and that each field has a docstring
|
name: Validate configuration has default values and that each field has a docstring
|
||||||
entry: python tools/pre_commit/validate_config.py
|
entry: python tools/validate_config.py
|
||||||
language: python
|
language: python
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
# Keep `suggestion` last
|
# Keep `suggestion` last
|
||||||
|
|||||||
@ -13,7 +13,6 @@ build:
|
|||||||
|
|
||||||
mkdocs:
|
mkdocs:
|
||||||
configuration: mkdocs.yaml
|
configuration: mkdocs.yaml
|
||||||
fail_on_warning: true
|
|
||||||
|
|
||||||
# Optionally declare the Python requirements required to build your docs
|
# Optionally declare the Python requirements required to build your docs
|
||||||
python:
|
python:
|
||||||
|
|||||||
114
CMakeLists.txt
114
CMakeLists.txt
@ -34,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
|||||||
# Supported python versions. These versions will be searched in order, the
|
# Supported python versions. These versions will be searched in order, the
|
||||||
# first match will be selected. These should be kept in sync with setup.py.
|
# first match will be selected. These should be kept in sync with setup.py.
|
||||||
#
|
#
|
||||||
set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
|
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
|
||||||
|
|
||||||
# Supported AMD GPU architectures.
|
# Supported AMD GPU architectures.
|
||||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supported/expected torch versions for CUDA/ROCm.
|
# Supported/expected torch versions for CUDA/ROCm.
|
||||||
@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from docker/Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@ -86,9 +86,6 @@ find_package(Torch REQUIRED)
|
|||||||
# Supported NVIDIA architectures.
|
# Supported NVIDIA architectures.
|
||||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
||||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
|
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
|
|
||||||
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
|
||||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||||
else()
|
else()
|
||||||
@ -178,15 +175,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
|
||||||
# Set compression mode for CUDA >=13.x.
|
|
||||||
#
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
|
|
||||||
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
|
||||||
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Set CUDA include flags for CXX compiler.
|
# Set CUDA include flags for CXX compiler.
|
||||||
#
|
#
|
||||||
@ -269,8 +257,8 @@ set(VLLM_EXT_SRC
|
|||||||
"csrc/sampler.cu"
|
"csrc/sampler.cu"
|
||||||
"csrc/cuda_view.cu"
|
"csrc/cuda_view.cu"
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/w8a8/int8/scaled_quant.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/w8a8/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
||||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||||
"csrc/quantization/activation_kernels.cu"
|
"csrc/quantization/activation_kernels.cu"
|
||||||
@ -282,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||||
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
|
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
|
||||||
|
|
||||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||||
@ -314,13 +302,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_EXT_SRC
|
list(APPEND VLLM_EXT_SRC
|
||||||
"csrc/quantization/awq/gemm_kernels.cu"
|
"csrc/quantization/awq/gemm_kernels.cu"
|
||||||
"csrc/permute_cols.cu"
|
"csrc/permute_cols.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||||
|
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||||
"csrc/cutlass_extensions/common.cpp"
|
"csrc/cutlass_extensions/common.cpp"
|
||||||
"csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
|
"csrc/quantization/fp8/per_token_group_quant.cu")
|
||||||
"csrc/quantization/w8a8/int8/per_token_group_quant.cu")
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_EXT_SRC}"
|
SRCS "${VLLM_EXT_SRC}"
|
||||||
@ -424,11 +412,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -452,16 +440,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
|
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
|
||||||
# CUDA 12.8 or later
|
# CUDA 12.8 or later
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
|
||||||
)
|
)
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
@ -486,16 +470,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
||||||
# require CUDA 12.8 or later
|
# require CUDA 12.8 or later
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
||||||
"csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
||||||
)
|
)
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
@ -526,7 +506,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# subtract out the archs that are already built for 3x
|
# subtract out the archs that are already built for 3x
|
||||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||||
if (SCALED_MM_2X_ARCHS)
|
if (SCALED_MM_2X_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
|
||||||
@ -570,11 +550,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
|
|
||||||
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
|
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
|
||||||
# CUDA 12.8 or later
|
# CUDA 12.8 or later
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
@ -593,11 +569,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# FP4 Archs and flags
|
# FP4 Archs and flags
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
@ -619,11 +591,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# CUTLASS MLA Archs and flags
|
# CUTLASS MLA Archs and flags
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
|
||||||
@ -649,7 +617,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# if it's possible to compile MoE kernels that use its output.
|
# if it's possible to compile MoE kernels that use its output.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -667,13 +635,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -692,13 +656,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
|
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
|
||||||
@ -715,13 +675,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
|
|
||||||
else()
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
|
|
||||||
endif()
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -883,7 +839,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
|
|||||||
set(VLLM_MOE_EXT_SRC
|
set(VLLM_MOE_EXT_SRC
|
||||||
"csrc/moe/torch_bindings.cpp"
|
"csrc/moe/torch_bindings.cpp"
|
||||||
"csrc/moe/moe_align_sum_kernels.cu"
|
"csrc/moe/moe_align_sum_kernels.cu"
|
||||||
"csrc/moe/moe_lora_align_sum_kernels.cu"
|
|
||||||
"csrc/moe/topk_softmax_kernels.cu")
|
"csrc/moe/topk_softmax_kernels.cu")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
@ -1008,7 +963,6 @@ endif()
|
|||||||
# For CUDA we also build and ship some external projects.
|
# For CUDA we also build and ship some external projects.
|
||||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
include(cmake/external_projects/flashmla.cmake)
|
include(cmake/external_projects/flashmla.cmake)
|
||||||
include(cmake/external_projects/qutlass.cmake)
|
|
||||||
|
|
||||||
# vllm-flash-attn should be last as it overwrites some CMake functions
|
# vllm-flash-attn should be last as it overwrites some CMake functions
|
||||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||||
|
|||||||
@ -21,8 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
|
|||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
|
|
||||||
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
|
|
||||||
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
||||||
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
||||||
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
||||||
@ -150,7 +148,6 @@ Compute Resources:
|
|||||||
- Trainy
|
- Trainy
|
||||||
- UC Berkeley
|
- UC Berkeley
|
||||||
- UC San Diego
|
- UC San Diego
|
||||||
- Volcengine
|
|
||||||
|
|
||||||
Slack Sponsor: Anyscale
|
Slack Sponsor: Anyscale
|
||||||
|
|
||||||
|
|||||||
@ -74,7 +74,7 @@ start_server() {
|
|||||||
local vllm_log=$4
|
local vllm_log=$4
|
||||||
local profile_dir=$5
|
local profile_dir=$5
|
||||||
|
|
||||||
pkill -if "vllm serve" || true
|
pkill -if vllm
|
||||||
|
|
||||||
# Define the common arguments as a bash array.
|
# Define the common arguments as a bash array.
|
||||||
# Each argument and its value are separate elements.
|
# Each argument and its value are separate elements.
|
||||||
@ -96,22 +96,17 @@ start_server() {
|
|||||||
# This correctly passes each element as a separate argument.
|
# This correctly passes each element as a separate argument.
|
||||||
if [[ -n "$profile_dir" ]]; then
|
if [[ -n "$profile_dir" ]]; then
|
||||||
# Start server with profiling enabled
|
# Start server with profiling enabled
|
||||||
VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
||||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||||
else
|
else
|
||||||
# Start server without profiling
|
# Start server without profiling
|
||||||
VLLM_SERVER_DEV_MODE=1 \
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
|
||||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||||
fi
|
fi
|
||||||
local server_pid=$!
|
|
||||||
|
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
server_started=0
|
server_started=0
|
||||||
for i in {1..60}; do
|
for i in {1..60}; do
|
||||||
# This line checks whether the server is still alive or not,
|
|
||||||
# since we should always have permission to send a signal to the server process.
|
|
||||||
kill -0 $server_pid 2> /dev/null || break
|
|
||||||
|
|
||||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||||
@ -123,7 +118,7 @@ start_server() {
|
|||||||
done
|
done
|
||||||
|
|
||||||
if (( ! server_started )); then
|
if (( ! server_started )); then
|
||||||
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
|
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
||||||
return 1
|
return 1
|
||||||
else
|
else
|
||||||
return 0
|
return 0
|
||||||
@ -139,7 +134,7 @@ run_benchmark() {
|
|||||||
echo "vllm_log: $vllm_log"
|
echo "vllm_log: $vllm_log"
|
||||||
echo
|
echo
|
||||||
rm -f $vllm_log
|
rm -f $vllm_log
|
||||||
pkill -if "vllm serve" || true
|
pkill -if vllm
|
||||||
|
|
||||||
echo "starting server..."
|
echo "starting server..."
|
||||||
# Call start_server without a profile_dir to avoid profiling overhead
|
# Call start_server without a profile_dir to avoid profiling overhead
|
||||||
@ -232,7 +227,7 @@ run_benchmark() {
|
|||||||
|
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||||
|
|
||||||
pkill -if "vllm serve" || true
|
pkill -if vllm
|
||||||
sleep 10
|
sleep 10
|
||||||
echo "===================="
|
echo "===================="
|
||||||
return 0
|
return 0
|
||||||
@ -308,6 +303,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
|||||||
else
|
else
|
||||||
echo "No configuration met the latency requirements. Skipping final profiling run."
|
echo "No configuration met the latency requirements. Skipping final profiling run."
|
||||||
fi
|
fi
|
||||||
pkill -if "vllm serve" || true
|
pkill -if vllm
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import huggingface_hub.constants
|
import huggingface_hub.constants
|
||||||
@ -27,13 +28,13 @@ class RequestFuncInput:
|
|||||||
prompt_len: int
|
prompt_len: int
|
||||||
output_len: int
|
output_len: int
|
||||||
model: str
|
model: str
|
||||||
model_name: str | None = None
|
model_name: Optional[str] = None
|
||||||
logprobs: int | None = None
|
logprobs: Optional[int] = None
|
||||||
extra_body: dict | None = None
|
extra_body: Optional[dict] = None
|
||||||
multi_modal_content: dict | list[dict] | None = None
|
multi_modal_content: Optional[dict | list[dict]] = None
|
||||||
ignore_eos: bool = False
|
ignore_eos: bool = False
|
||||||
language: str | None = None
|
language: Optional[str] = None
|
||||||
request_id: str | None = None
|
request_id: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -51,7 +52,7 @@ class RequestFuncOutput:
|
|||||||
|
|
||||||
async def async_request_tgi(
|
async def async_request_tgi(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
@ -132,7 +133,7 @@ async def async_request_tgi(
|
|||||||
|
|
||||||
async def async_request_trt_llm(
|
async def async_request_trt_llm(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
@ -203,7 +204,7 @@ async def async_request_trt_llm(
|
|||||||
|
|
||||||
async def async_request_deepspeed_mii(
|
async def async_request_deepspeed_mii(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("completions", "profile")), (
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(
|
|||||||
|
|
||||||
async def async_request_openai_completions(
|
async def async_request_openai_completions(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("completions", "profile")), (
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
@ -366,7 +367,7 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
async def async_request_openai_chat_completions(
|
async def async_request_openai_chat_completions(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("chat/completions", "profile")), (
|
assert api_url.endswith(("chat/completions", "profile")), (
|
||||||
@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(
|
|||||||
|
|
||||||
async def async_request_openai_audio(
|
async def async_request_openai_audio(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||||
import soundfile
|
import soundfile
|
||||||
@ -609,7 +610,7 @@ def get_tokenizer(
|
|||||||
tokenizer_mode: str = "auto",
|
tokenizer_mode: str = "auto",
|
||||||
trust_remote_code: bool = False,
|
trust_remote_code: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
pretrained_model_name_or_path
|
pretrained_model_name_or_path
|
||||||
):
|
):
|
||||||
|
|||||||
@ -2,10 +2,10 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
from benchmark_utils import TimeCollector
|
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from benchmark_utils import TimeCollector
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
from vllm.v1.core.block_pool import BlockPool
|
from vllm.v1.core.block_pool import BlockPool
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -46,7 +46,7 @@ import time
|
|||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
|
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
|
||||||
|
|||||||
@ -1,31 +1,17 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import gc
|
import gc
|
||||||
import time
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from benchmark_utils import TimeCollector
|
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
from vllm.config import (
|
from benchmark_utils import TimeCollector
|
||||||
CacheConfig,
|
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||||
DeviceConfig,
|
from vllm.utils import FlexibleArgumentParser
|
||||||
LoadConfig,
|
|
||||||
ModelConfig,
|
|
||||||
ParallelConfig,
|
|
||||||
SchedulerConfig,
|
|
||||||
SpeculativeConfig,
|
|
||||||
VllmConfig,
|
|
||||||
)
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
|
||||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
|
||||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_propose(args):
|
def main(args):
|
||||||
rows = []
|
rows = []
|
||||||
for max_ngram in args.max_ngram:
|
for max_ngram in args.max_ngram:
|
||||||
collector = TimeCollector(TimeCollector.US)
|
collector = TimeCollector(TimeCollector.US)
|
||||||
@ -83,88 +69,10 @@ def benchmark_propose(args):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def benchmark_batched_propose(args):
|
|
||||||
NUM_SPECULATIVE_TOKENS_NGRAM = 10
|
|
||||||
PROMPT_LOOKUP_MIN = 5
|
|
||||||
PROMPT_LOOKUP_MAX = 15
|
|
||||||
MAX_MODEL_LEN = int(1e7)
|
|
||||||
DEVICE = current_platform.device_type
|
|
||||||
|
|
||||||
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
|
|
||||||
|
|
||||||
speculative_config = SpeculativeConfig(
|
|
||||||
target_model_config=model_config,
|
|
||||||
target_parallel_config=ParallelConfig(),
|
|
||||||
method="ngram",
|
|
||||||
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
|
|
||||||
prompt_lookup_max=PROMPT_LOOKUP_MAX,
|
|
||||||
prompt_lookup_min=PROMPT_LOOKUP_MIN,
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
cache_config=CacheConfig(),
|
|
||||||
speculative_config=speculative_config,
|
|
||||||
device_config=DeviceConfig(device=current_platform.device_type),
|
|
||||||
parallel_config=ParallelConfig(),
|
|
||||||
load_config=LoadConfig(),
|
|
||||||
scheduler_config=SchedulerConfig(),
|
|
||||||
)
|
|
||||||
|
|
||||||
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
|
|
||||||
mock_pp_group = mock.MagicMock()
|
|
||||||
mock_pp_group.world_size = 1
|
|
||||||
with mock.patch(
|
|
||||||
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
|
|
||||||
):
|
|
||||||
runner = GPUModelRunner(vllm_config, DEVICE)
|
|
||||||
|
|
||||||
# hack max model len
|
|
||||||
runner.max_model_len = MAX_MODEL_LEN
|
|
||||||
runner.drafter.max_model_len = MAX_MODEL_LEN
|
|
||||||
|
|
||||||
dummy_input_batch = InputBatch(
|
|
||||||
max_num_reqs=args.num_req,
|
|
||||||
max_model_len=MAX_MODEL_LEN,
|
|
||||||
max_num_batched_tokens=args.num_req * args.num_token,
|
|
||||||
device=DEVICE,
|
|
||||||
pin_memory=False,
|
|
||||||
vocab_size=256000,
|
|
||||||
block_sizes=[16],
|
|
||||||
)
|
|
||||||
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
|
|
||||||
dummy_input_batch.spec_decode_unsupported_reqs = ()
|
|
||||||
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
|
|
||||||
dummy_input_batch.token_ids_cpu = np.random.randint(
|
|
||||||
0, 20, (args.num_req, args.num_token)
|
|
||||||
)
|
|
||||||
|
|
||||||
runner.input_batch = dummy_input_batch
|
|
||||||
|
|
||||||
sampled_token_ids = [[0]] * args.num_req
|
|
||||||
|
|
||||||
print("Starting benchmark")
|
|
||||||
# first run is warmup so ignore it
|
|
||||||
for _ in range(args.num_iteration):
|
|
||||||
start = time.time()
|
|
||||||
runner.drafter.propose(
|
|
||||||
sampled_token_ids,
|
|
||||||
dummy_input_batch.req_ids,
|
|
||||||
dummy_input_batch.num_tokens_no_spec,
|
|
||||||
dummy_input_batch.token_ids_cpu,
|
|
||||||
dummy_input_batch.spec_decode_unsupported_reqs,
|
|
||||||
)
|
|
||||||
end = time.time()
|
|
||||||
print(f"Iteration time (s): {end - start}")
|
|
||||||
|
|
||||||
|
|
||||||
def invoke_main() -> None:
|
def invoke_main() -> None:
|
||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark the performance of N-gram speculative decode drafting"
|
description="Benchmark the performance of N-gram speculative decode drafting"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--batched", action="store_true", help="consider time to prepare batch"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-iteration",
|
"--num-iteration",
|
||||||
type=int,
|
type=int,
|
||||||
@ -197,17 +105,8 @@ def invoke_main() -> None:
|
|||||||
help="Number of speculative tokens to generate",
|
help="Number of speculative tokens to generate",
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
main(args)
|
||||||
if not args.batched:
|
|
||||||
benchmark_propose(args)
|
|
||||||
else:
|
|
||||||
benchmark_batched_propose(args)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
# Example command lines:
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py
|
|
||||||
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
|
|
||||||
""" # noqa: E501
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
invoke_main() # pragma: no cover
|
invoke_main() # pragma: no cover
|
||||||
|
|||||||
@ -32,12 +32,13 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
|
|||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_length_range: tuple[int, int],
|
input_length_range: tuple[int, int],
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
) -> list[Request]:
|
) -> list[Request]:
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
@ -127,7 +128,7 @@ def sample_requests_from_random(
|
|||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_length_range: tuple[int, int],
|
input_length_range: tuple[int, int],
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
prefix_len: int,
|
prefix_len: int,
|
||||||
) -> list[Request]:
|
) -> list[Request]:
|
||||||
requests = []
|
requests = []
|
||||||
|
|||||||
@ -7,11 +7,12 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
# Select an equiprobable random priority
|
# Select an equiprobable random priority
|
||||||
@ -23,7 +24,7 @@ def sample_requests(
|
|||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
) -> list[tuple[str, int, int, int]]:
|
) -> list[tuple[str, int, int, int]]:
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
|
|||||||
@ -31,19 +31,20 @@ import time
|
|||||||
import uuid
|
import uuid
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
from contextlib import nullcontext
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
from backend_request_func import (
|
from backend_request_func import (
|
||||||
ASYNC_REQUEST_FUNCS,
|
ASYNC_REQUEST_FUNCS,
|
||||||
RequestFuncInput,
|
RequestFuncInput,
|
||||||
RequestFuncOutput,
|
RequestFuncOutput,
|
||||||
)
|
)
|
||||||
from tqdm.asyncio import tqdm
|
|
||||||
from transformers import PreTrainedTokenizerBase
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
@ -51,7 +52,7 @@ except ImportError:
|
|||||||
from backend_request_func import get_tokenizer
|
from backend_request_func import get_tokenizer
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||||
|
|
||||||
@ -316,7 +317,7 @@ def calculate_metrics(
|
|||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
selected_percentile_metrics: list[str],
|
selected_percentile_metrics: list[str],
|
||||||
selected_percentiles: list[float],
|
selected_percentiles: list[float],
|
||||||
goodput_config_dict: dict[str, float] | None = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
) -> tuple[BenchmarkMetrics, list[int]]:
|
) -> tuple[BenchmarkMetrics, list[int]]:
|
||||||
actual_output_lens: list[int] = []
|
actual_output_lens: list[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
@ -436,9 +437,9 @@ async def benchmark(
|
|||||||
selected_percentile_metrics: list[str],
|
selected_percentile_metrics: list[str],
|
||||||
selected_percentiles: list[str],
|
selected_percentiles: list[str],
|
||||||
ignore_eos: bool,
|
ignore_eos: bool,
|
||||||
max_concurrency: int | None,
|
max_concurrency: Optional[int],
|
||||||
structured_output_ratio: float,
|
structured_output_ratio: float,
|
||||||
goodput_config_dict: dict[str, float] | None = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
@ -448,8 +449,7 @@ async def benchmark(
|
|||||||
def prepare_extra_body(request) -> dict:
|
def prepare_extra_body(request) -> dict:
|
||||||
extra_body = {}
|
extra_body = {}
|
||||||
# Add the schema to the extra_body
|
# Add the schema to the extra_body
|
||||||
extra_body["structured_outputs"] = {}
|
extra_body[request.structure_type] = request.schema
|
||||||
extra_body["structured_outputs"][request.structure_type] = request.schema
|
|
||||||
return extra_body
|
return extra_body
|
||||||
|
|
||||||
print("Starting initial single prompt test run...")
|
print("Starting initial single prompt test run...")
|
||||||
@ -502,9 +502,15 @@ async def benchmark(
|
|||||||
|
|
||||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
|
# This can be used once the minimum Python version is 3.10 or higher,
|
||||||
|
# and it will simplify the code in limited_request_func.
|
||||||
|
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||||
|
# if max_concurrency else contextlib.nullcontext())
|
||||||
|
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||||
|
|
||||||
async def limited_request_func(request_func_input, pbar):
|
async def limited_request_func(request_func_input, pbar):
|
||||||
|
if semaphore is None:
|
||||||
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
|
|
||||||
@ -903,13 +909,13 @@ def create_argument_parser():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tokenizer",
|
"--tokenizer",
|
||||||
type=str,
|
type=str,
|
||||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--tokenizer-mode",
|
"--tokenizer-mode",
|
||||||
type=str,
|
type=str,
|
||||||
default="auto",
|
default="auto",
|
||||||
help="Name or path of the tokenizer, if not using the default tokenizer.",
|
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--num-prompts",
|
"--num-prompts",
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import math
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from types import TracebackType
|
from types import TracebackType
|
||||||
from typing import Any
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
|
||||||
def convert_to_pytorch_benchmark_format(
|
def convert_to_pytorch_benchmark_format(
|
||||||
@ -92,7 +92,7 @@ class TimeCollector:
|
|||||||
def __init__(self, scale: int) -> None:
|
def __init__(self, scale: int) -> None:
|
||||||
self.cnt: int = 0
|
self.cnt: int = 0
|
||||||
self._sum: int = 0
|
self._sum: int = 0
|
||||||
self._max: int | None = None
|
self._max: Optional[int] = None
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.start_time: int = time.monotonic_ns()
|
self.start_time: int = time.monotonic_ns()
|
||||||
|
|
||||||
@ -104,13 +104,13 @@ class TimeCollector:
|
|||||||
else:
|
else:
|
||||||
self._max = max(self._max, v)
|
self._max = max(self._max, v)
|
||||||
|
|
||||||
def avg(self) -> float | str:
|
def avg(self) -> Union[float, str]:
|
||||||
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
||||||
|
|
||||||
def max(self) -> float | str:
|
def max(self) -> Union[float, str]:
|
||||||
return self._max / self.scale if self._max else "N/A"
|
return self._max / self.scale if self._max else "N/A"
|
||||||
|
|
||||||
def dump_avg_max(self) -> list[float | str]:
|
def dump_avg_max(self) -> list[Union[float, str]]:
|
||||||
return [self.avg(), self.max()]
|
return [self.avg(), self.max()]
|
||||||
|
|
||||||
def __enter__(self) -> None:
|
def __enter__(self) -> None:
|
||||||
@ -118,8 +118,8 @@ class TimeCollector:
|
|||||||
|
|
||||||
def __exit__(
|
def __exit__(
|
||||||
self,
|
self,
|
||||||
exc_type: type[BaseException] | None,
|
exc_type: Optional[type[BaseException]],
|
||||||
exc_value: BaseException | None,
|
exc_value: Optional[BaseException],
|
||||||
exc_traceback: TracebackType | None,
|
exc_traceback: Optional[TracebackType],
|
||||||
) -> None:
|
) -> None:
|
||||||
self.collect(time.monotonic_ns() - self.start_time)
|
self.collect(time.monotonic_ns() - self.start_time)
|
||||||
|
|||||||
@ -6,7 +6,8 @@ import copy
|
|||||||
import itertools
|
import itertools
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@ -15,7 +16,7 @@ from utils import make_rand_sparse_tensors
|
|||||||
from weight_shapes import WEIGHT_SHAPES
|
from weight_shapes import WEIGHT_SHAPES
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||||
|
|||||||
@ -6,7 +6,8 @@ import copy
|
|||||||
import itertools
|
import itertools
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@ -16,10 +17,9 @@ from weight_shapes import WEIGHT_SHAPES
|
|||||||
|
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||||
w8a8_triton_block_scaled_mm,
|
w8a8_block_fp8_matmul,
|
||||||
)
|
)
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser, cdiv
|
||||||
from vllm.utils.math_utils import cdiv
|
|
||||||
|
|
||||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||||
@ -53,7 +53,7 @@ def bench_int8(
|
|||||||
n: int,
|
n: int,
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
"""Benchmark INT8-based kernels."""
|
"""Benchmark INT8-based kernels."""
|
||||||
assert dtype == torch.int8
|
assert dtype == torch.int8
|
||||||
@ -108,7 +108,7 @@ def bench_fp8(
|
|||||||
n: int,
|
n: int,
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
"""Benchmark FP8-based kernels."""
|
"""Benchmark FP8-based kernels."""
|
||||||
assert dtype == torch.float8_e4m3fn
|
assert dtype == torch.float8_e4m3fn
|
||||||
@ -158,7 +158,7 @@ def bench_fp8(
|
|||||||
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
|
"cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
|
||||||
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
|
a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
|
||||||
),
|
),
|
||||||
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
|
"triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
|
||||||
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
|
a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
|
||||||
),
|
),
|
||||||
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
|
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
|
||||||
@ -183,7 +183,7 @@ def bench(
|
|||||||
n: int,
|
n: int,
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
if dtype == torch.int8:
|
if dtype == torch.int8:
|
||||||
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
|
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
|
||||||
@ -201,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
|
|||||||
def run(
|
def run(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
MKNs: Iterable[tuple[int, int, int]],
|
MKNs: Iterable[tuple[int, int, int]],
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
results = []
|
results = []
|
||||||
for m, k, n in MKNs:
|
for m, k, n in MKNs:
|
||||||
|
|||||||
@ -55,7 +55,9 @@ benchmark() {
|
|||||||
output_len=$2
|
output_len=$2
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
@ -63,7 +65,9 @@ benchmark() {
|
|||||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
|
|||||||
@ -38,12 +38,16 @@ wait_for_server() {
|
|||||||
launch_chunked_prefill() {
|
launch_chunked_prefill() {
|
||||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
# disagg prefill
|
# disagg prefill
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--enable-chunked-prefill \
|
--enable-chunked-prefill \
|
||||||
--gpu-memory-utilization 0.6 &
|
--gpu-memory-utilization 0.6 &
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--enable-chunked-prefill \
|
--enable-chunked-prefill \
|
||||||
@ -58,14 +62,18 @@ launch_chunked_prefill() {
|
|||||||
launch_disagg_prefill() {
|
launch_disagg_prefill() {
|
||||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
model="meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
# disagg prefill
|
# disagg prefill
|
||||||
CUDA_VISIBLE_DEVICES=0 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=0 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8100 \
|
--port 8100 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
--kv-transfer-config \
|
--kv-transfer-config \
|
||||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||||
|
|
||||||
CUDA_VISIBLE_DEVICES=1 vllm serve $model \
|
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
--model $model \
|
||||||
--port 8200 \
|
--port 8200 \
|
||||||
--max-model-len 10000 \
|
--max-model-len 10000 \
|
||||||
--gpu-memory-utilization 0.6 \
|
--gpu-memory-utilization 0.6 \
|
||||||
|
|||||||
@ -3,9 +3,10 @@
|
|||||||
|
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from itertools import product
|
from itertools import product
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
|
|||||||
def unfused_int8_impl(
|
def unfused_int8_impl(
|
||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: Optional[torch.Tensor],
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
@ -67,7 +68,7 @@ def unfused_int8_impl(
|
|||||||
def unfused_fp8_impl(
|
def unfused_fp8_impl(
|
||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: Optional[torch.Tensor],
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
@ -84,7 +85,7 @@ def unfused_fp8_impl(
|
|||||||
def fused_impl(
|
def fused_impl(
|
||||||
rms_norm_layer: RMSNorm, # this stores the weights
|
rms_norm_layer: RMSNorm, # this stores the weights
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: Optional[torch.Tensor],
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||||
|
|||||||
@ -1,191 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
#
|
|
||||||
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
|
|
||||||
# All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import copy
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
|
|
||||||
from weight_shapes import WEIGHT_SHAPES
|
|
||||||
|
|
||||||
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
|
|
||||||
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
|
|
||||||
from vllm.triton_utils import triton
|
|
||||||
|
|
||||||
PROVIDER_CFGS = {
|
|
||||||
"torch-bf16": dict(enabled=True),
|
|
||||||
"mxfp4": dict(no_a_quant=False, enabled=True),
|
|
||||||
"mxfp4-noquant": dict(no_a_quant=True, enabled=True),
|
|
||||||
}
|
|
||||||
|
|
||||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
|
||||||
|
|
||||||
|
|
||||||
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
|
|
||||||
return (
|
|
||||||
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
|
|
||||||
* group_size**-0.5
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _quant_weight_mxfp4(
|
|
||||||
b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
|
|
||||||
):
|
|
||||||
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
|
|
||||||
b, forward_hadamard_matrix, method="abs_max"
|
|
||||||
)
|
|
||||||
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
|
|
||||||
return weight_hf_e2m1, weight_hf_scale_block
|
|
||||||
|
|
||||||
|
|
||||||
def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
|
|
||||||
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
|
|
||||||
b, forward_hadamard_matrix, device
|
|
||||||
)
|
|
||||||
alpha = torch.tensor([1.0], device="cuda")
|
|
||||||
|
|
||||||
if cfg["no_a_quant"]:
|
|
||||||
# Pre-quantize activation
|
|
||||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
|
|
||||||
a, forward_hadamard_matrix, method="abs_max"
|
|
||||||
)
|
|
||||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
|
|
||||||
|
|
||||||
def run():
|
|
||||||
return matmul_mxf4_bf16_tn(
|
|
||||||
input_hf_e2m1,
|
|
||||||
weight_hf_e2m1,
|
|
||||||
input_hf_scale_block,
|
|
||||||
weight_hf_scale_block,
|
|
||||||
alpha,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
|
|
||||||
# Quantize activation on-the-fly
|
|
||||||
def run():
|
|
||||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
|
|
||||||
a, forward_hadamard_matrix, method="abs_max"
|
|
||||||
)
|
|
||||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
|
|
||||||
return matmul_mxf4_bf16_tn(
|
|
||||||
input_hf_e2m1,
|
|
||||||
weight_hf_e2m1,
|
|
||||||
input_hf_scale_block,
|
|
||||||
weight_hf_scale_block,
|
|
||||||
alpha,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
|
|
||||||
|
|
||||||
@triton.testing.perf_report(
|
|
||||||
triton.testing.Benchmark(
|
|
||||||
x_names=["batch_size"],
|
|
||||||
x_vals=[
|
|
||||||
1,
|
|
||||||
4,
|
|
||||||
8,
|
|
||||||
16,
|
|
||||||
32,
|
|
||||||
64,
|
|
||||||
128,
|
|
||||||
256,
|
|
||||||
512,
|
|
||||||
1024,
|
|
||||||
2048,
|
|
||||||
4096,
|
|
||||||
8192,
|
|
||||||
16384,
|
|
||||||
24576,
|
|
||||||
32768,
|
|
||||||
],
|
|
||||||
x_log=False,
|
|
||||||
line_arg="provider",
|
|
||||||
line_vals=_enabled,
|
|
||||||
line_names=_enabled,
|
|
||||||
ylabel="TFLOP/s (larger is better)",
|
|
||||||
plot_name="BF16 vs MXFP4 GEMMs",
|
|
||||||
args={},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
def benchmark(batch_size, provider, N, K, had_size):
|
|
||||||
M = batch_size
|
|
||||||
device = "cuda"
|
|
||||||
dtype = torch.bfloat16
|
|
||||||
|
|
||||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
|
||||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
|
||||||
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
|
|
||||||
|
|
||||||
quantiles = [0.5, 0.2, 0.8]
|
|
||||||
|
|
||||||
if provider == "torch-bf16":
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
|
||||||
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
cfg = PROVIDER_CFGS[provider]
|
|
||||||
run_quant = build_mxfp4_runner(
|
|
||||||
cfg, a, b, forward_hadamard_matrix, dtype, device
|
|
||||||
)
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
|
||||||
lambda: run_quant(), rep=200, quantiles=quantiles
|
|
||||||
)
|
|
||||||
|
|
||||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
|
||||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_shapes(args):
|
|
||||||
out = []
|
|
||||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
|
||||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
|
||||||
KN[tp_dim] //= tp_size
|
|
||||||
KN.append(model)
|
|
||||||
out.append(KN)
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"--models",
|
|
||||||
nargs="+",
|
|
||||||
type=str,
|
|
||||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
|
||||||
choices=list(WEIGHT_SHAPES.keys()),
|
|
||||||
)
|
|
||||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
for K, N, model in prepare_shapes(args):
|
|
||||||
for had_size in [32, 64, 128]:
|
|
||||||
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
|
|
||||||
benchmark.run(
|
|
||||||
print_data=True,
|
|
||||||
show_plots=True,
|
|
||||||
save_path=f"bench_mxfp4_res_n{N}_k{K}",
|
|
||||||
N=N,
|
|
||||||
K=K,
|
|
||||||
had_size=had_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Benchmark finished!")
|
|
||||||
@ -3,7 +3,6 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import copy
|
import copy
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from weight_shapes import WEIGHT_SHAPES
|
from weight_shapes import WEIGHT_SHAPES
|
||||||
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
|
|||||||
"torch-bf16": dict(enabled=True),
|
"torch-bf16": dict(enabled=True),
|
||||||
"nvfp4": dict(no_a_quant=False, enabled=True),
|
"nvfp4": dict(no_a_quant=False, enabled=True),
|
||||||
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
||||||
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
|
|
||||||
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_needs_fbgemm = any(
|
|
||||||
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
|
|
||||||
)
|
|
||||||
if _needs_fbgemm:
|
|
||||||
try:
|
|
||||||
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
|
|
||||||
triton_scale_nvfp4_quant,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
print(
|
|
||||||
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
|
|
||||||
"These providers will be skipped. Please install fbgemm_gpu with: "
|
|
||||||
"'pip install fbgemm-gpu-genai' to run them."
|
|
||||||
)
|
|
||||||
# Disable FBGEMM providers so the benchmark can run.
|
|
||||||
for cfg in PROVIDER_CFGS.values():
|
|
||||||
if cfg.get("fbgemm"):
|
|
||||||
cfg["enabled"] = False
|
|
||||||
|
|
||||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||||
|
|
||||||
|
|
||||||
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
|
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
|
||||||
# Compute global scale for weight
|
# Compute global scale for weight
|
||||||
b_amax = torch.abs(b).max().to(torch.float32)
|
b_amax = torch.abs(b).max().to(torch.float32)
|
||||||
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
|
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
|
||||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
||||||
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
|
|
||||||
else:
|
|
||||||
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
|
|
||||||
return b_fp4, scale_b_fp4, b_global_scale
|
return b_fp4, scale_b_fp4, b_global_scale
|
||||||
|
|
||||||
|
|
||||||
def build_nvfp4_runner(cfg, a, b, dtype, device):
|
def build_nvfp4_runner(cfg, a, b, dtype, device):
|
||||||
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
|
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
|
||||||
|
|
||||||
# Compute global scale for activation
|
# Compute global scale for activation
|
||||||
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
|
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
|
||||||
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
|
|||||||
|
|
||||||
# Alpha for the GEMM operation
|
# Alpha for the GEMM operation
|
||||||
alpha = 1.0 / (a_global_scale * b_global_scale)
|
alpha = 1.0 / (a_global_scale * b_global_scale)
|
||||||
if "fbgemm" in cfg and cfg["fbgemm"]:
|
|
||||||
if cfg["no_a_quant"]:
|
|
||||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
|
||||||
|
|
||||||
def run():
|
|
||||||
return torch.ops.fbgemm.f4f4bf16(
|
|
||||||
a_fp4,
|
|
||||||
b_fp4,
|
|
||||||
scale_a_fp4,
|
|
||||||
scale_b_fp4,
|
|
||||||
global_scale=alpha,
|
|
||||||
use_mx=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
else:
|
|
||||||
|
|
||||||
def run():
|
|
||||||
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
|
|
||||||
return torch.ops.fbgemm.f4f4bf16(
|
|
||||||
a_fp4,
|
|
||||||
b_fp4,
|
|
||||||
scale_a_fp4,
|
|
||||||
scale_b_fp4,
|
|
||||||
global_scale=alpha,
|
|
||||||
use_mx=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
|
|
||||||
if cfg["no_a_quant"]:
|
if cfg["no_a_quant"]:
|
||||||
# Pre-quantize activation
|
# Pre-quantize activation
|
||||||
@ -184,13 +130,10 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
for K, N, model in prepare_shapes(args):
|
for K, N, model in prepare_shapes(args):
|
||||||
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
||||||
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
|
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
|
||||||
|
|
||||||
benchmark.run(
|
benchmark.run(
|
||||||
print_data=True,
|
print_data=True,
|
||||||
show_plots=True,
|
show_plots=True,
|
||||||
save_path=save_dir,
|
save_path=f"bench_nvfp4_res_n{N}_k{K}",
|
||||||
N=N,
|
N=N,
|
||||||
K=K,
|
K=K,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -1,207 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
#
|
|
||||||
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
|
|
||||||
# All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import copy
|
|
||||||
import itertools
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
|
|
||||||
from weight_shapes import WEIGHT_SHAPES
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
|
|
||||||
from vllm._custom_ops import fusedQuantizeNv
|
|
||||||
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
|
|
||||||
from vllm.triton_utils import triton
|
|
||||||
|
|
||||||
PROVIDER_CFGS = {
|
|
||||||
"torch-bf16": dict(enabled=True),
|
|
||||||
"nvfp4": dict(no_a_quant=False, enabled=True),
|
|
||||||
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
|
|
||||||
}
|
|
||||||
|
|
||||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
|
||||||
|
|
||||||
|
|
||||||
def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
|
|
||||||
return (
|
|
||||||
deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
|
|
||||||
* group_size**-0.5
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _quant_weight_nvfp4(
|
|
||||||
b: torch.Tensor,
|
|
||||||
forward_hadamard_matrix: torch.Tensor,
|
|
||||||
global_scale: torch.Tensor,
|
|
||||||
device: str,
|
|
||||||
M: int,
|
|
||||||
N: int,
|
|
||||||
K: int,
|
|
||||||
):
|
|
||||||
weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
|
|
||||||
b, forward_hadamard_matrix, global_scale
|
|
||||||
)
|
|
||||||
weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
|
|
||||||
-1, K // 16
|
|
||||||
)
|
|
||||||
return weight_hf_e2m1, weight_hf_scale_block
|
|
||||||
|
|
||||||
|
|
||||||
def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
|
|
||||||
alpha = torch.tensor([1.0], device="cuda")
|
|
||||||
global_scale = torch.tensor([1.0], device="cuda")
|
|
||||||
weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
|
|
||||||
b, forward_hadamard_matrix, global_scale, device, M, N, K
|
|
||||||
)
|
|
||||||
|
|
||||||
if cfg["no_a_quant"]:
|
|
||||||
# Pre-quantize activation
|
|
||||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
|
|
||||||
a, forward_hadamard_matrix, global_scale
|
|
||||||
)
|
|
||||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
|
|
||||||
-1, K // 16
|
|
||||||
)
|
|
||||||
|
|
||||||
def run():
|
|
||||||
return ops.cutlass_scaled_fp4_mm(
|
|
||||||
input_hf_e2m1,
|
|
||||||
weight_hf_e2m1,
|
|
||||||
input_hf_scale_block,
|
|
||||||
weight_hf_scale_block,
|
|
||||||
alpha,
|
|
||||||
torch.bfloat16,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
|
|
||||||
# Quantize activation on-the-fly
|
|
||||||
def run():
|
|
||||||
input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
|
|
||||||
a, forward_hadamard_matrix, global_scale
|
|
||||||
)
|
|
||||||
input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
|
|
||||||
-1, K // 16
|
|
||||||
)
|
|
||||||
return ops.cutlass_scaled_fp4_mm(
|
|
||||||
input_hf_e2m1,
|
|
||||||
weight_hf_e2m1,
|
|
||||||
input_hf_scale_block,
|
|
||||||
weight_hf_scale_block,
|
|
||||||
alpha,
|
|
||||||
torch.bfloat16,
|
|
||||||
)
|
|
||||||
|
|
||||||
return run
|
|
||||||
|
|
||||||
|
|
||||||
@triton.testing.perf_report(
|
|
||||||
triton.testing.Benchmark(
|
|
||||||
x_names=["batch_size"],
|
|
||||||
x_vals=[
|
|
||||||
1,
|
|
||||||
4,
|
|
||||||
8,
|
|
||||||
16,
|
|
||||||
32,
|
|
||||||
64,
|
|
||||||
128,
|
|
||||||
256,
|
|
||||||
512,
|
|
||||||
1024,
|
|
||||||
2048,
|
|
||||||
4096,
|
|
||||||
8192,
|
|
||||||
16384,
|
|
||||||
24576,
|
|
||||||
32768,
|
|
||||||
],
|
|
||||||
x_log=False,
|
|
||||||
line_arg="provider",
|
|
||||||
line_vals=_enabled,
|
|
||||||
line_names=_enabled,
|
|
||||||
ylabel="TFLOP/s (larger is better)",
|
|
||||||
plot_name="BF16 vs NVFP4 GEMMs",
|
|
||||||
args={},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
def benchmark(batch_size, provider, N, K, had_size):
|
|
||||||
M = batch_size
|
|
||||||
device = "cuda"
|
|
||||||
dtype = torch.bfloat16
|
|
||||||
|
|
||||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
|
||||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
|
||||||
forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
|
|
||||||
|
|
||||||
quantiles = [0.5, 0.2, 0.8]
|
|
||||||
|
|
||||||
if provider == "torch-bf16":
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
|
||||||
lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
cfg = PROVIDER_CFGS[provider]
|
|
||||||
run_quant = build_nvfp4_runner(
|
|
||||||
cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
|
|
||||||
)
|
|
||||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
|
||||||
lambda: run_quant(), rep=200, quantiles=quantiles
|
|
||||||
)
|
|
||||||
|
|
||||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
|
||||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_shapes(args):
|
|
||||||
out = []
|
|
||||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
|
||||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
|
||||||
KN[tp_dim] //= tp_size
|
|
||||||
KN.append(model)
|
|
||||||
out.append(KN)
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument(
|
|
||||||
"--models",
|
|
||||||
nargs="+",
|
|
||||||
type=str,
|
|
||||||
default=["meta-llama/Llama-3.3-70B-Instruct"],
|
|
||||||
choices=list(WEIGHT_SHAPES.keys()),
|
|
||||||
)
|
|
||||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
for K, N, model in prepare_shapes(args):
|
|
||||||
for had_size in [16, 32, 64, 128]:
|
|
||||||
print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
|
|
||||||
benchmark.run(
|
|
||||||
print_data=True,
|
|
||||||
show_plots=True,
|
|
||||||
save_path=f"bench_nvfp4_res_n{N}_k{K}",
|
|
||||||
N=N,
|
|
||||||
K=K,
|
|
||||||
had_size=had_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Benchmark finished!")
|
|
||||||
@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import itertools
|
import itertools
|
||||||
from collections.abc import Callable
|
from typing import Callable
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -10,8 +10,7 @@ import torch
|
|||||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
|
|
||||||
def with_triton_mode(fn):
|
def with_triton_mode(fn):
|
||||||
@ -52,7 +51,7 @@ def calculate_diff(
|
|||||||
):
|
):
|
||||||
"""Calculate the difference between Inductor and CUDA implementations."""
|
"""Calculate the difference between Inductor and CUDA implementations."""
|
||||||
device = torch.device("cuda")
|
device = torch.device("cuda")
|
||||||
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
|
x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
|
||||||
|
|
||||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
|
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
|
||||||
|
|
||||||
@ -60,25 +59,23 @@ def calculate_diff(
|
|||||||
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
|
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
|
||||||
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
|
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
|
||||||
|
|
||||||
try:
|
out_allclose = lambda o1, o2: torch.allclose(
|
||||||
torch.testing.assert_close(
|
o1.to(torch.float32),
|
||||||
cuda_out.to(torch.float32),
|
o2.to(torch.float32),
|
||||||
torch_out.to(torch.float32),
|
rtol=1e-3,
|
||||||
rtol=1e-3,
|
atol=1e-5,
|
||||||
atol=1e-5,
|
)
|
||||||
)
|
scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
|
||||||
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
|
|
||||||
torch.testing.assert_close(
|
if (
|
||||||
cuda_out.to(torch.float32),
|
out_allclose(cuda_out, torch_out)
|
||||||
torch_eager_out.to(torch.float32),
|
and scale_allclose(cuda_scale, torch_scale)
|
||||||
rtol=1e-3,
|
and out_allclose(cuda_out, torch_eager_out)
|
||||||
atol=1e-5,
|
and scale_allclose(cuda_scale, torch_eager_scale)
|
||||||
)
|
):
|
||||||
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
|
|
||||||
print("✅ All implementations match")
|
print("✅ All implementations match")
|
||||||
except AssertionError as e:
|
else:
|
||||||
print("❌ Implementations differ")
|
print("❌ Implementations differ")
|
||||||
print(e)
|
|
||||||
|
|
||||||
|
|
||||||
configs = []
|
configs = []
|
||||||
@ -94,7 +91,7 @@ def benchmark_quantization(
|
|||||||
):
|
):
|
||||||
device = torch.device("cuda")
|
device = torch.device("cuda")
|
||||||
|
|
||||||
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
|
x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
|
||||||
|
|
||||||
quantiles = [0.5, 0.2, 0.8]
|
quantiles = [0.5, 0.2, 0.8]
|
||||||
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
|
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
|
||||||
@ -160,21 +157,21 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
parser.add_argument("-c", "--check", action="store_true")
|
parser.add_argument("-c", "--check", action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
|
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--hidden-sizes",
|
"--hidden-sizes",
|
||||||
type=int,
|
type=int,
|
||||||
nargs="+",
|
nargs="+",
|
||||||
default=[896, 1024, 2048, 4096, 7168],
|
default=None,
|
||||||
help="Hidden sizes to benchmark",
|
help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--batch-sizes",
|
"--batch-sizes",
|
||||||
type=int,
|
type=int,
|
||||||
nargs="+",
|
nargs="+",
|
||||||
default=[1, 16, 128, 512, 1024],
|
default=None,
|
||||||
help="Batch sizes to benchmark",
|
help="Batch sizes to benchmark (default: 1,16,32,64,128)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--group-sizes",
|
"--group-sizes",
|
||||||
@ -195,8 +192,8 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
|
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
|
||||||
|
|
||||||
hidden_sizes = args.hidden_sizes
|
hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||||
batch_sizes = args.batch_sizes
|
batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
|
||||||
|
|
||||||
if args.group_sizes is not None:
|
if args.group_sizes is not None:
|
||||||
group_shapes = []
|
group_shapes = []
|
||||||
|
|||||||
@ -10,8 +10,7 @@ import vllm.model_executor.layers.activation # noqa F401
|
|||||||
from vllm.model_executor.custom_op import CustomOp
|
from vllm.model_executor.custom_op import CustomOp
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.triton_utils import triton
|
from vllm.triton_utils import triton
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
|
||||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
|
||||||
|
|
||||||
batch_size_range = [1, 16, 32, 64, 128]
|
batch_size_range = [1, 16, 32, 64, 128]
|
||||||
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
|
||||||
|
|||||||
@ -28,7 +28,7 @@ except ImportError as e:
|
|||||||
|
|
||||||
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
|
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
|
||||||
|
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description="Benchmark BitBLAS int4 on a specific target."
|
description="Benchmark BitBLAS int4 on a specific target."
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
|
|||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
|
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
||||||
from vllm.scalar_type import scalar_types
|
from vllm.scalar_type import scalar_types
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
WEIGHT_SHAPES_MOE = {
|
WEIGHT_SHAPES_MOE = {
|
||||||
"nvidia/DeepSeek-R1-FP4": [
|
"nvidia/DeepSeek-R1-FP4": [
|
||||||
|
|||||||
@ -1,406 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
"""
|
|
||||||
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
|
|
||||||
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
|
|
||||||
but use different quantization strategies and backends.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import nvtx
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from vllm import _custom_ops as ops
|
|
||||||
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
|
|
||||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
|
||||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
|
||||||
|
|
||||||
# Weight shapes for different models: [num_experts, topk, hidden_size,
|
|
||||||
# intermediate_size]
|
|
||||||
WEIGHT_SHAPES_MOE = {
|
|
||||||
"mixtral-8x7b": [
|
|
||||||
[8, 2, 4096, 14336],
|
|
||||||
],
|
|
||||||
"deepseek-v2": [
|
|
||||||
[160, 6, 5120, 12288],
|
|
||||||
],
|
|
||||||
"custom-small": [
|
|
||||||
[8, 2, 2048, 7168],
|
|
||||||
],
|
|
||||||
"glm45-fp8": [
|
|
||||||
[128, 8, 4096, 1408],
|
|
||||||
],
|
|
||||||
"Llama-4-Maverick-17B-128E-Instruct-FP8": [
|
|
||||||
[128, 1, 5120, 8192],
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
DEFAULT_MODELS = [
|
|
||||||
"mixtral-8x7b",
|
|
||||||
]
|
|
||||||
|
|
||||||
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
|
|
||||||
DEFAULT_TP_SIZES = [1]
|
|
||||||
|
|
||||||
PER_ACT_TOKEN_OPTS = [False, True]
|
|
||||||
PER_OUT_CH_OPTS = [False, True]
|
|
||||||
|
|
||||||
FP8_DTYPE = current_platform.fp8_dtype()
|
|
||||||
|
|
||||||
|
|
||||||
def bench_run(
|
|
||||||
results: list,
|
|
||||||
model: str,
|
|
||||||
num_experts: int,
|
|
||||||
topk: int,
|
|
||||||
per_act_token: bool,
|
|
||||||
per_out_ch: bool,
|
|
||||||
mkn: tuple[int, int, int],
|
|
||||||
):
|
|
||||||
(m, k, n) = mkn
|
|
||||||
|
|
||||||
dtype = torch.half
|
|
||||||
device = "cuda"
|
|
||||||
|
|
||||||
# Create input activations
|
|
||||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
|
||||||
|
|
||||||
# Create weights
|
|
||||||
w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
|
|
||||||
w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
|
|
||||||
|
|
||||||
# Create FP8 quantized weights and scales for both kernels
|
|
||||||
w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
|
|
||||||
w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
|
|
||||||
|
|
||||||
# Create scales based on quantization strategy
|
|
||||||
if per_out_ch:
|
|
||||||
# Per-channel quantization
|
|
||||||
w1_scale = torch.empty(
|
|
||||||
(num_experts, 2 * n, 1), device=device, dtype=torch.float32
|
|
||||||
)
|
|
||||||
w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
|
|
||||||
else:
|
|
||||||
# Per-tensor quantization
|
|
||||||
w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
|
|
||||||
w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
|
|
||||||
|
|
||||||
# Quantize weights
|
|
||||||
for expert in range(num_experts):
|
|
||||||
if per_out_ch:
|
|
||||||
# Per-channel quantization - not yet implemented properly
|
|
||||||
# For now, fall back to per-tensor quantization
|
|
||||||
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
|
|
||||||
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
|
|
||||||
# Expand scalar scales to the expected per-channel shape
|
|
||||||
w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
|
|
||||||
w2_scale[expert] = w2_scale_temp.expand(k, 1)
|
|
||||||
else:
|
|
||||||
# Per-tensor quantization
|
|
||||||
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
|
|
||||||
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
|
|
||||||
# Store scalar scales in [1, 1] tensors
|
|
||||||
w1_scale[expert, 0, 0] = w1_scale_temp
|
|
||||||
w2_scale[expert, 0, 0] = w2_scale_temp
|
|
||||||
|
|
||||||
# Prepare weights for CUTLASS (no transpose needed)
|
|
||||||
w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
|
|
||||||
w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
|
|
||||||
|
|
||||||
# Create router scores and get topk
|
|
||||||
score = torch.randn((m, num_experts), device=device, dtype=dtype)
|
|
||||||
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
|
|
||||||
|
|
||||||
# WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
|
|
||||||
# Force per-tensor quantization for all cases to match working e2e setup
|
|
||||||
a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
|
|
||||||
a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
|
|
||||||
|
|
||||||
# Force per-tensor quantization for all cases
|
|
||||||
per_act_token = False
|
|
||||||
|
|
||||||
# Create stride tensors for CUTLASS
|
|
||||||
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
|
||||||
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
|
|
||||||
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
|
|
||||||
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
|
|
||||||
|
|
||||||
def run_triton_moe(
|
|
||||||
a: torch.Tensor,
|
|
||||||
w1: torch.Tensor,
|
|
||||||
w2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
|
||||||
topk_ids: torch.Tensor,
|
|
||||||
w1_scale: torch.Tensor,
|
|
||||||
w2_scale: torch.Tensor,
|
|
||||||
a1_scale: torch.Tensor,
|
|
||||||
a2_scale: torch.Tensor,
|
|
||||||
num_repeats: int,
|
|
||||||
):
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
|
||||||
fused_experts(
|
|
||||||
a,
|
|
||||||
w1,
|
|
||||||
w2,
|
|
||||||
topk_weights,
|
|
||||||
topk_ids,
|
|
||||||
quant_config=quant_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_cutlass_moe_fp8(
|
|
||||||
a: torch.Tensor,
|
|
||||||
w1: torch.Tensor,
|
|
||||||
w2: torch.Tensor,
|
|
||||||
topk_weights: torch.Tensor,
|
|
||||||
topk_ids: torch.Tensor,
|
|
||||||
ab_strides1: torch.Tensor,
|
|
||||||
ab_strides2: torch.Tensor,
|
|
||||||
c_strides1: torch.Tensor,
|
|
||||||
c_strides2: torch.Tensor,
|
|
||||||
w1_scale: torch.Tensor,
|
|
||||||
w2_scale: torch.Tensor,
|
|
||||||
a1_scale: torch.Tensor,
|
|
||||||
a2_scale: torch.Tensor,
|
|
||||||
num_repeats: int,
|
|
||||||
):
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
for _ in range(num_repeats):
|
|
||||||
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
|
|
||||||
cutlass_moe_fp8(
|
|
||||||
a=a,
|
|
||||||
w1_q=w1,
|
|
||||||
w2_q=w2,
|
|
||||||
topk_weights=topk_weights,
|
|
||||||
topk_ids=topk_ids,
|
|
||||||
ab_strides1=ab_strides1,
|
|
||||||
ab_strides2=ab_strides2,
|
|
||||||
c_strides1=c_strides1,
|
|
||||||
c_strides2=c_strides2,
|
|
||||||
quant_config=quant_config,
|
|
||||||
activation="silu",
|
|
||||||
global_num_experts=num_experts,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Pre-create quantization config to avoid creating it inside CUDA graph
|
|
||||||
quant_config = fp8_w8a8_moe_quant_config(
|
|
||||||
w1_scale=w1_scale,
|
|
||||||
w2_scale=w2_scale,
|
|
||||||
a1_scale=a1_scale,
|
|
||||||
a2_scale=a2_scale,
|
|
||||||
per_act_token_quant=per_act_token,
|
|
||||||
per_out_ch_quant=per_out_ch,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
|
|
||||||
cutlass_stream = torch.cuda.Stream()
|
|
||||||
cutlass_graph = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
|
|
||||||
# Capture 10 invocations like benchmark_moe.py
|
|
||||||
for _ in range(10):
|
|
||||||
cutlass_moe_fp8(
|
|
||||||
a=a,
|
|
||||||
w1_q=w1_fp8q_cutlass,
|
|
||||||
w2_q=w2_fp8q_cutlass,
|
|
||||||
topk_weights=topk_weights,
|
|
||||||
topk_ids=topk_ids,
|
|
||||||
ab_strides1=ab_strides1,
|
|
||||||
ab_strides2=ab_strides2,
|
|
||||||
c_strides1=c_strides1,
|
|
||||||
c_strides2=c_strides2,
|
|
||||||
quant_config=quant_config,
|
|
||||||
activation="silu",
|
|
||||||
global_num_experts=num_experts,
|
|
||||||
)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
|
|
||||||
triton_stream = torch.cuda.Stream()
|
|
||||||
triton_graph = torch.cuda.CUDAGraph()
|
|
||||||
with torch.cuda.graph(triton_graph, stream=triton_stream):
|
|
||||||
# Capture 10 invocations like benchmark_moe.py
|
|
||||||
for _ in range(10):
|
|
||||||
fused_experts(
|
|
||||||
a,
|
|
||||||
w1_fp8q,
|
|
||||||
w2_fp8q,
|
|
||||||
topk_weights,
|
|
||||||
topk_ids,
|
|
||||||
quant_config=quant_config,
|
|
||||||
)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
|
|
||||||
"""Benchmark CUDA graph using events like benchmark_moe.py"""
|
|
||||||
# Warmup
|
|
||||||
for _ in range(num_warmup):
|
|
||||||
graph.replay()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
# Timing
|
|
||||||
start_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
end_event = torch.cuda.Event(enable_timing=True)
|
|
||||||
|
|
||||||
latencies = []
|
|
||||||
for _ in range(num_iters):
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
start_event.record()
|
|
||||||
graph.replay()
|
|
||||||
end_event.record()
|
|
||||||
end_event.synchronize()
|
|
||||||
latencies.append(start_event.elapsed_time(end_event))
|
|
||||||
|
|
||||||
# Divide by 10 since graph contains 10 calls
|
|
||||||
return sum(latencies) / (num_iters * 10)
|
|
||||||
|
|
||||||
# Benchmark parameters
|
|
||||||
num_warmup = 5
|
|
||||||
num_iters = 100
|
|
||||||
|
|
||||||
# Benchmark only CUDA graphs (more reliable and faster)
|
|
||||||
# Benchmark Triton MoE with CUDA graphs
|
|
||||||
triton_graph_time = bench_cuda_graph(
|
|
||||||
triton_graph, num_warmup=num_warmup, num_iters=num_iters
|
|
||||||
)
|
|
||||||
|
|
||||||
# Benchmark CUTLASS MoE with CUDA graphs
|
|
||||||
cutlass_graph_time = bench_cuda_graph(
|
|
||||||
cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert ms to us and return results
|
|
||||||
triton_time_us = triton_graph_time * 1000
|
|
||||||
cutlass_time_us = cutlass_graph_time * 1000
|
|
||||||
|
|
||||||
return {
|
|
||||||
"batch_size": m,
|
|
||||||
"triton_time_us": triton_time_us,
|
|
||||||
"cutlass_time_us": cutlass_time_us,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
|
||||||
print("Benchmarking models:")
|
|
||||||
for i, model in enumerate(args.models):
|
|
||||||
print(f"[{i}] {model}")
|
|
||||||
|
|
||||||
all_results = []
|
|
||||||
|
|
||||||
for model in args.models:
|
|
||||||
for tp in args.tp_sizes:
|
|
||||||
for layer in WEIGHT_SHAPES_MOE[model]:
|
|
||||||
num_experts = layer[0]
|
|
||||||
topk = layer[1]
|
|
||||||
size_k = layer[2]
|
|
||||||
size_n = layer[3] // tp
|
|
||||||
|
|
||||||
if len(args.limit_k) > 0 and size_k not in args.limit_k:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if len(args.limit_n) > 0 and size_n not in args.limit_n:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for per_act_token in args.per_act_token_opts:
|
|
||||||
for per_out_ch in args.per_out_ch_opts:
|
|
||||||
print(
|
|
||||||
f"\n=== {model}, experts={num_experts}, topk={topk},"
|
|
||||||
f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
|
|
||||||
)
|
|
||||||
|
|
||||||
config_results = []
|
|
||||||
for size_m in args.batch_sizes:
|
|
||||||
mkn = (size_m, size_k, size_n)
|
|
||||||
result = bench_run(
|
|
||||||
[], # Not used anymore
|
|
||||||
model,
|
|
||||||
num_experts,
|
|
||||||
topk,
|
|
||||||
per_act_token,
|
|
||||||
per_out_ch,
|
|
||||||
mkn,
|
|
||||||
)
|
|
||||||
if result:
|
|
||||||
config_results.append(result)
|
|
||||||
|
|
||||||
# Print results table for this configuration
|
|
||||||
if config_results:
|
|
||||||
print(
|
|
||||||
f"\n{'Batch Size':<12}"
|
|
||||||
f"{'Triton (us)':<15}"
|
|
||||||
f"{'CUTLASS (us)':<15}"
|
|
||||||
)
|
|
||||||
print("-" * 45)
|
|
||||||
for result in config_results:
|
|
||||||
print(
|
|
||||||
f"{result['batch_size']:<12}"
|
|
||||||
f"{result['triton_time_us']:<15.2f}"
|
|
||||||
f"{result['cutlass_time_us']:<15.2f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
all_results.extend(config_results)
|
|
||||||
|
|
||||||
print(f"\nTotal benchmarks completed: {len(all_results)}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = FlexibleArgumentParser(
|
|
||||||
description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
|
|
||||||
across specified models/shapes/batches
|
|
||||||
|
|
||||||
Example usage:
|
|
||||||
python benchmark_cutlass_moe_fp8.py \
|
|
||||||
--model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
|
|
||||||
--tp-sizes 8 \
|
|
||||||
--batch-size 2 4 8 \
|
|
||||||
--per-act-token-opts false \
|
|
||||||
--per-out-ch-opts false
|
|
||||||
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--models",
|
|
||||||
nargs="+",
|
|
||||||
type=str,
|
|
||||||
default=DEFAULT_MODELS,
|
|
||||||
choices=WEIGHT_SHAPES_MOE.keys(),
|
|
||||||
)
|
|
||||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
|
|
||||||
parser.add_argument(
|
|
||||||
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
|
|
||||||
)
|
|
||||||
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
|
|
||||||
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
|
|
||||||
parser.add_argument(
|
|
||||||
"--per-act-token-opts",
|
|
||||||
nargs="+",
|
|
||||||
type=lambda x: x.lower() == "true",
|
|
||||||
default=[False, True],
|
|
||||||
help="Per-activation token quantization options (true/false)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--per-out-ch-opts",
|
|
||||||
nargs="+",
|
|
||||||
type=lambda x: x.lower() == "true",
|
|
||||||
default=[False, True],
|
|
||||||
help="Per-output channel quantization options (true/false)",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(args)
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user