[Benchmark] Cleanup deprecated nightly benchmark and adjust the docstring for performance benchmark (#25786)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
This commit is contained in:
@ -1,184 +0,0 @@
|
||||
steps:
|
||||
- label: "Wait for container to be ready"
|
||||
key: wait-for-container-image
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
containers:
|
||||
- image: badouralix/curl-jq
|
||||
command:
|
||||
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
|
||||
- label: "Cleanup H100"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: ~
|
||||
command: docker system prune -a --volumes --force
|
||||
|
||||
- label: "A100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: A100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch == "main"
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
priorityClassName: perf-benchmark
|
||||
containers:
|
||||
- image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||
volumes:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
- label: "H200"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H200
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch == "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: 4,5,6,7
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
|
||||
#- block: "Run H100 Benchmark"
|
||||
#key: block-h100
|
||||
#depends_on: ~
|
||||
|
||||
- label: "H100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch == "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
|
||||
# Premerge benchmark
|
||||
- label: "A100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: A100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch != "main"
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
priorityClassName: perf-benchmark
|
||||
containers:
|
||||
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||
volumes:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
|
||||
- label: "H200"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H200
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch != "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: 4,5,6,7
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
|
||||
#- block: "Run H100 Benchmark"
|
||||
#key: block-h100
|
||||
#depends_on: ~
|
||||
|
||||
- label: "H100"
|
||||
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
|
||||
agents:
|
||||
queue: H100
|
||||
depends_on: wait-for-container-image
|
||||
if: build.branch != "main"
|
||||
plugins:
|
||||
- docker#v5.12.0:
|
||||
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
|
||||
command:
|
||||
- bash
|
||||
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
mount-buildkite-agent: true
|
||||
propagate-environment: true
|
||||
ipc: host
|
||||
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
|
||||
volumes:
|
||||
- /data/benchmark-hf-cache:/root/.cache/huggingface
|
||||
environment:
|
||||
- VLLM_USAGE_SOURCE
|
||||
- HF_TOKEN
|
||||
@ -1,28 +0,0 @@
|
||||
# Nightly benchmark annotation
|
||||
|
||||
## Description
|
||||
|
||||
This file contains the downloading link for benchmarking results.
|
||||
|
||||
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
|
||||
- [benchmarking results](artifact://results.zip)
|
||||
- [benchmarking code](artifact://nightly-benchmarks.zip)
|
||||
|
||||
Please download the visualization scripts in the post
|
||||
|
||||
## Results reproduction
|
||||
|
||||
- Find the docker we use in `benchmarking pipeline`
|
||||
- Deploy the docker, and inside the docker:
|
||||
- Download `nightly-benchmarks.zip`.
|
||||
- In the same folder, run the following code:
|
||||
|
||||
```bash
|
||||
export HF_TOKEN=<your HF token>
|
||||
apt update
|
||||
apt install -y git
|
||||
unzip nightly-benchmarks.zip
|
||||
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
```
|
||||
|
||||
And the results will be inside `./benchmarks/results`.
|
||||
@ -1,39 +0,0 @@
|
||||
|
||||
# Nightly benchmark
|
||||
|
||||
This benchmark aims to:
|
||||
|
||||
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
|
||||
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
|
||||
|
||||
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
|
||||
|
||||
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
|
||||
|
||||
## Setup
|
||||
|
||||
- Docker images:
|
||||
- vLLM: `vllm/vllm-openai:v0.6.2`
|
||||
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
|
||||
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
|
||||
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
|
||||
- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
|
||||
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
|
||||
- Hardware
|
||||
- 8x Nvidia A100 GPUs
|
||||
- Workload:
|
||||
- Dataset
|
||||
- ShareGPT dataset
|
||||
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
|
||||
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
|
||||
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
|
||||
- Models: llama-3 8B, llama-3 70B.
|
||||
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
|
||||
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
|
||||
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
|
||||
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
|
||||
|
||||
## Known issues
|
||||
|
||||
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
|
||||
- TGI does not support `ignore-eos` flag.
|
||||
@ -1,196 +0,0 @@
|
||||
common_pod_spec: &common_pod_spec
|
||||
priorityClassName: perf-benchmark
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
|
||||
volumes:
|
||||
- name: devshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
- name: hf-cache
|
||||
hostPath:
|
||||
path: /root/.cache/huggingface
|
||||
type: Directory
|
||||
|
||||
common_container_settings: &common_container_settings
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
- name: hf-cache
|
||||
mountPath: /root/.cache/huggingface
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
|
||||
steps:
|
||||
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
|
||||
|
||||
|
||||
|
||||
- label: "A100 vllm step 10"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: vllm/vllm-openai:v0.6.2
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
|
||||
- label: "A100 sglang benchmark"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: lmsysorg/sglang:v0.3.2-cu121
|
||||
<<: *common_container_settings
|
||||
|
||||
- label: "A100 lmdeploy benchmark"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: openmmlab/lmdeploy:v0.6.1-cu12
|
||||
<<: *common_container_settings
|
||||
|
||||
|
||||
|
||||
|
||||
- label: "A100 trt llama-8B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama8B"
|
||||
|
||||
|
||||
- label: "A100 trt llama-70B"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
<<: *common_container_settings
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_HOME
|
||||
value: /root/.cache/huggingface
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
- name: TEST_SELECTOR
|
||||
value: "llama70B"
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
|
||||
# - label: "A100 trt benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
|
||||
# <<: *common_container_settings
|
||||
|
||||
|
||||
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
|
||||
# - label: "A100 tgi benchmark"
|
||||
# priority: 100
|
||||
# agents:
|
||||
# queue: A100
|
||||
# plugins:
|
||||
# - kubernetes:
|
||||
# podSpec:
|
||||
# <<: *common_pod_spec
|
||||
# containers:
|
||||
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
|
||||
# <<: *common_container_settings
|
||||
|
||||
- wait
|
||||
|
||||
- label: "Collect the results"
|
||||
priority: 100
|
||||
agents:
|
||||
queue: A100
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
<<: *common_pod_spec
|
||||
containers:
|
||||
- image: vllm/vllm-openai:v0.5.0.post1
|
||||
command:
|
||||
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 8
|
||||
volumeMounts:
|
||||
- name: devshm
|
||||
mountPath: /dev/shm
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: VLLM_SOURCE_CODE_LOC
|
||||
value: /workspace/build/buildkite/vllm/performance-benchmark
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
|
||||
- block: ":rocket: check the results!"
|
||||
@ -1,26 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
def main(model, cachedir):
|
||||
# Load the tokenizer and save it to the specified directory
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
tokenizer.save_pretrained(cachedir)
|
||||
print(f"Tokenizer saved to {cachedir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download and save Hugging Face tokenizer"
|
||||
)
|
||||
parser.add_argument("--model", type=str, required=True, help="Name of the model")
|
||||
parser.add_argument(
|
||||
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args.model, args.cachedir)
|
||||
@ -1,97 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Parse command line arguments for summary-nightly-results script."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--results-folder",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The folder where the results are stored.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--description", type=str, required=True, help="Description of the results."
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_perf(df, method, model, metric):
|
||||
means = []
|
||||
|
||||
for qps in [2, 4, 8, 16, "inf"]:
|
||||
target = df["Test name"].str.contains(model)
|
||||
target = target & df["Engine"].str.contains(method)
|
||||
target = target & df["Test name"].str.contains("qps_" + str(qps))
|
||||
filtered_df = df[target]
|
||||
|
||||
if filtered_df.empty:
|
||||
means.append(0.0)
|
||||
else:
|
||||
means.append(filtered_df[metric].values[0])
|
||||
|
||||
return np.array(means)
|
||||
|
||||
|
||||
def get_perf_w_std(df, method, model, metric):
|
||||
if metric in ["TTFT", "ITL"]:
|
||||
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
|
||||
mean = mean.tolist()
|
||||
std = get_perf(df, method, model, "Std " + metric + " (ms)")
|
||||
if std.mean() == 0:
|
||||
std = None
|
||||
success = get_perf(df, method, model, "Successful req.")
|
||||
if std is not None:
|
||||
std = std / np.sqrt(success)
|
||||
std = std.tolist()
|
||||
|
||||
else:
|
||||
assert metric == "Tput"
|
||||
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
|
||||
df, method, model, "Output Tput (tok/s)"
|
||||
)
|
||||
mean = mean.tolist()
|
||||
std = None
|
||||
|
||||
return mean, std
|
||||
|
||||
|
||||
def main(args):
|
||||
results_folder = Path(args.results_folder)
|
||||
|
||||
results = []
|
||||
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*_nightly_results.json"):
|
||||
with open(test_file) as f:
|
||||
results = results + json.loads(f.read())
|
||||
|
||||
# generate markdown table
|
||||
df = pd.DataFrame.from_dict(results)
|
||||
|
||||
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
|
||||
|
||||
with open(args.description) as f:
|
||||
description = f.read()
|
||||
|
||||
description = description.format(nightly_results_benchmarking_table=md_table)
|
||||
|
||||
with open("nightly_results.md", "w") as f:
|
||||
f.write(description)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
main(args)
|
||||
@ -1,9 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from lmdeploy.serve.openai.api_client import APIClient
|
||||
|
||||
api_client = APIClient("http://localhost:8000")
|
||||
model_name = api_client.available_models[0]
|
||||
|
||||
print(model_name)
|
||||
@ -1,78 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
|
||||
main() {
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
(which jq) || (apt-get update && apt-get -y install jq)
|
||||
(which zip) || (apt-get install -y zip)
|
||||
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip plotting the results."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# initial annotation
|
||||
#description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
|
||||
|
||||
# download results
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
mkdir -p results/
|
||||
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
|
||||
ls
|
||||
ls results/
|
||||
|
||||
# upload benchmark results
|
||||
zip -r results.zip results/
|
||||
/workspace/buildkite-agent artifact upload "results.zip"
|
||||
|
||||
# upload benchmarking scripts
|
||||
cd "$VLLM_SOURCE_CODE_LOC/"
|
||||
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
|
||||
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
# upload benchmarking pipeline
|
||||
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
|
||||
|
||||
|
||||
|
||||
# The figures should be generated by a separate process outside the CI/CD pipeline
|
||||
|
||||
# # generate figures
|
||||
# python3 -m pip install tabulate pandas matplotlib
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
|
||||
# --description $description \
|
||||
# --results-folder results/
|
||||
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sharegpt
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_2048_128
|
||||
|
||||
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
|
||||
# --description $description \
|
||||
# --results-folder results/ \
|
||||
# --dataset sonnet_128_2048
|
||||
|
||||
# # upload results and figures
|
||||
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
|
||||
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@ -1,464 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
check_hf_token() {
|
||||
# check if HF_TOKEN is available and valid
|
||||
if [[ -z "$HF_TOKEN" ]]; then
|
||||
echo "Error: HF_TOKEN is not set."
|
||||
exit 1
|
||||
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
|
||||
echo "Error: HF_TOKEN does not start with 'hf_'."
|
||||
exit 1
|
||||
else
|
||||
echo "HF_TOKEN is set and valid."
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
upload_to_buildkite() {
|
||||
# upload the benchmarking results to buildkite
|
||||
|
||||
# if the agent binary is not found, skip uploading the results, exit 0
|
||||
if [ ! -f /workspace/buildkite-agent ]; then
|
||||
echo "buildkite-agent binary not found. Skip uploading the results."
|
||||
return 0
|
||||
fi
|
||||
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
|
||||
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
|
||||
}
|
||||
|
||||
|
||||
get_current_llm_serving_engine() {
|
||||
|
||||
if which lmdeploy >/dev/null; then
|
||||
echo "Container: lmdeploy"
|
||||
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /tgi-entrypoint.sh ]; then
|
||||
echo "Container: tgi"
|
||||
export CURRENT_LLM_SERVING_ENGINE=tgi
|
||||
return
|
||||
fi
|
||||
|
||||
if which trtllm-build >/dev/null; then
|
||||
echo "Container: tensorrt-llm"
|
||||
export CURRENT_LLM_SERVING_ENGINE=trt
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /sgl-workspace ]; then
|
||||
echo "Container: sglang"
|
||||
export CURRENT_LLM_SERVING_ENGINE=sglang
|
||||
return
|
||||
fi
|
||||
|
||||
if [ -e /vllm-workspace ]; then
|
||||
echo "Container: vllm"
|
||||
# move to a completely irrelevant directory, to avoid import vllm from current folder
|
||||
export CURRENT_LLM_SERVING_ENGINE=vllm
|
||||
|
||||
return
|
||||
fi
|
||||
}
|
||||
|
||||
json2args() {
|
||||
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||
# example:
|
||||
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||
local json_string=$1
|
||||
local args=$(
|
||||
echo "$json_string" | jq -r '
|
||||
to_entries |
|
||||
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||
join(" ")
|
||||
'
|
||||
)
|
||||
echo "$args"
|
||||
}
|
||||
|
||||
kill_gpu_processes() {
|
||||
pkill -f '[p]ython'
|
||||
pkill -f '[p]ython3'
|
||||
pkill -f '[t]ritonserver'
|
||||
pkill -f '[p]t_main_thread'
|
||||
pkill -f '[t]ext-generation'
|
||||
pkill -f '[l]mdeploy'
|
||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||
pkill -f '[V]LLM'
|
||||
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
# wait for vllm server to start
|
||||
# return 1 if vllm server crashes
|
||||
timeout 1200 bash -c '
|
||||
until curl -s localhost:8000/v1/completions > /dev/null; do
|
||||
sleep 1
|
||||
done' && return 0 || return 1
|
||||
}
|
||||
|
||||
ensure_installed() {
|
||||
# Ensure that the given command is installed by apt-get
|
||||
local cmd=$1
|
||||
if ! which "$cmd" >/dev/null; then
|
||||
apt-get update && apt-get install -y "$cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
run_serving_tests() {
|
||||
# run serving tests using `vllm bench serve` command
|
||||
# $1: a json file specifying serving test cases
|
||||
|
||||
local serving_test_file
|
||||
serving_test_file=$1
|
||||
|
||||
# Iterate over serving tests
|
||||
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepend the current serving engine to the test name
|
||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
||||
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
|
||||
client_args=$(json2args "$client_params")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ $reuse_server == "true" ]]; then
|
||||
echo "Reuse previous server for test case $test_name"
|
||||
else
|
||||
kill_gpu_processes
|
||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
||||
"$server_params" "$common_params"
|
||||
fi
|
||||
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# prepare tokenizer
|
||||
# this is required for lmdeploy.
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
rm -rf /tokenizer_cache
|
||||
mkdir /tokenizer_cache
|
||||
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
|
||||
--model "$model" \
|
||||
--cachedir /tokenizer_cache
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
|
||||
|
||||
# change model name for lmdeploy (it will not follow standard hf name)
|
||||
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
|
||||
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps="inf"
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
|
||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||
|
||||
if [[ $backend = "trt" ]]; then
|
||||
backend="tensorrt-llm"
|
||||
fi
|
||||
|
||||
if [[ "$backend" == *"vllm"* ]]; then
|
||||
backend="vllm"
|
||||
fi
|
||||
|
||||
if [[ "$dataset_name" = "sharegpt" ]]; then
|
||||
|
||||
client_command="vllm bench serve \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
elif [[ "$dataset_name" = "sonnet" ]]; then
|
||||
|
||||
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
|
||||
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
|
||||
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
|
||||
|
||||
client_command="vllm bench serve \
|
||||
--backend $backend \
|
||||
--tokenizer /tokenizer_cache \
|
||||
--model $model \
|
||||
--dataset-name $dataset_name \
|
||||
--dataset-path $dataset_path \
|
||||
--num-prompts $num_prompts \
|
||||
--sonnet-input-len $sonnet_input_len \
|
||||
--sonnet-output-len $sonnet_output_len \
|
||||
--sonnet-prefix-len $sonnet_prefix_len \
|
||||
--port $port \
|
||||
--save-result \
|
||||
--result-dir $RESULTS_FOLDER \
|
||||
--result-filename ${new_test_name}.json \
|
||||
--request-rate $qps \
|
||||
--ignore-eos \
|
||||
$client_args"
|
||||
|
||||
else
|
||||
|
||||
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
|
||||
|
||||
echo "Running test case $test_name with qps $qps"
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
server_command="None"
|
||||
|
||||
# record the benchmarking commands
|
||||
jq_output=$(jq -n \
|
||||
--arg server "$server_command" \
|
||||
--arg client "$client_command" \
|
||||
--arg gpu "$gpu_type" \
|
||||
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
|
||||
'{
|
||||
server_command: $server,
|
||||
client_command: $client,
|
||||
gpu_type: $gpu,
|
||||
engine: $engine
|
||||
}')
|
||||
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
|
||||
|
||||
done
|
||||
|
||||
done
|
||||
|
||||
kill_gpu_processes
|
||||
}
|
||||
|
||||
run_genai_perf_tests() {
|
||||
# run genai-perf tests
|
||||
|
||||
# $1: a json file specifying genai-perf test cases
|
||||
local genai_perf_test_file
|
||||
genai_perf_test_file=$1
|
||||
|
||||
# Iterate over genai-perf tests
|
||||
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
|
||||
# get the test name, and append the GPU type back to it.
|
||||
test_name=$(echo "$params" | jq -r '.test_name')
|
||||
|
||||
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||
echo "Skip test case $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
# prepend the current serving engine to the test name
|
||||
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
|
||||
|
||||
# get common parameters
|
||||
common_params=$(echo "$params" | jq -r '.common_parameters')
|
||||
model=$(echo "$common_params" | jq -r '.model')
|
||||
tp=$(echo "$common_params" | jq -r '.tp')
|
||||
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
|
||||
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
|
||||
port=$(echo "$common_params" | jq -r '.port')
|
||||
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
|
||||
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
|
||||
|
||||
# get client and server arguments
|
||||
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
|
||||
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||
echo "Running over qps list $qps_list"
|
||||
|
||||
# check if there is enough GPU to run the test
|
||||
if [[ $gpu_count -lt $tp ]]; then
|
||||
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ $reuse_server == "true" ]]; then
|
||||
echo "Reuse previous server for test case $test_name"
|
||||
else
|
||||
kill_gpu_processes
|
||||
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
|
||||
"$server_params" "$common_params"
|
||||
fi
|
||||
|
||||
if wait_for_server; then
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
|
||||
else
|
||||
echo ""
|
||||
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
|
||||
break
|
||||
fi
|
||||
|
||||
# iterate over different QPS
|
||||
for qps in $qps_list; do
|
||||
# remove the surrounding single quote from qps
|
||||
if [[ "$qps" == *"inf"* ]]; then
|
||||
echo "qps was $qps"
|
||||
qps=$num_prompts
|
||||
echo "now qps is $qps"
|
||||
fi
|
||||
|
||||
new_test_name=$test_name"_qps_"$qps
|
||||
backend=$CURRENT_LLM_SERVING_ENGINE
|
||||
|
||||
if [[ "$backend" == *"vllm"* ]]; then
|
||||
backend="vllm"
|
||||
fi
|
||||
#TODO: add output dir.
|
||||
client_command="genai-perf profile \
|
||||
-m $model \
|
||||
--service-kind openai \
|
||||
--backend "$backend" \
|
||||
--endpoint-type chat \
|
||||
--streaming \
|
||||
--url localhost:$port \
|
||||
--request-rate $qps \
|
||||
--num-prompts $num_prompts \
|
||||
"
|
||||
|
||||
echo "Client command: $client_command"
|
||||
|
||||
eval "$client_command"
|
||||
|
||||
#TODO: process/record outputs
|
||||
done
|
||||
done
|
||||
|
||||
kill_gpu_processes
|
||||
|
||||
}
|
||||
|
||||
prepare_dataset() {
|
||||
|
||||
# download sharegpt dataset
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
echo "" > sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat sonnet.txt >> sonnet_4x.txt
|
||||
done
|
||||
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
# check if the environment variable is successfully injected from yaml
|
||||
|
||||
check_gpus
|
||||
check_hf_token
|
||||
get_current_llm_serving_engine
|
||||
|
||||
pip install -U transformers
|
||||
|
||||
pip install -r requirements/dev.txt
|
||||
which genai-perf
|
||||
|
||||
# check storage
|
||||
df -h
|
||||
|
||||
ensure_installed wget
|
||||
ensure_installed curl
|
||||
ensure_installed jq
|
||||
# genai-perf dependency
|
||||
ensure_installed libb64-0d
|
||||
|
||||
prepare_dataset
|
||||
|
||||
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
|
||||
|
||||
# run the test
|
||||
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
|
||||
|
||||
# run genai-perf tests
|
||||
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
|
||||
mv artifacts/ $RESULTS_FOLDER/
|
||||
|
||||
# upload benchmark results to buildkite
|
||||
python3 -m pip install tabulate pandas
|
||||
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
|
||||
upload_to_buildkite
|
||||
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@ -1,82 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from tabulate import tabulate
|
||||
|
||||
results_folder = Path("results/")
|
||||
|
||||
# serving results and the keys that will be printed into markdown
|
||||
serving_results = []
|
||||
serving_column_mapping = {
|
||||
"test_name": "Test name",
|
||||
"gpu_type": "GPU",
|
||||
"completed": "Successful req.",
|
||||
"request_throughput": "Tput (req/s)",
|
||||
"mean_ttft_ms": "Mean TTFT (ms)",
|
||||
"std_ttft_ms": "Std TTFT (ms)",
|
||||
"median_ttft_ms": "Median TTFT (ms)",
|
||||
"mean_itl_ms": "Mean ITL (ms)",
|
||||
"std_itl_ms": "Std ITL (ms)",
|
||||
"median_itl_ms": "Median ITL (ms)",
|
||||
"mean_tpot_ms": "Mean TPOT (ms)",
|
||||
"std_tpot_ms": "Std TPOT (ms)",
|
||||
"median_tpot_ms": "Median TPOT (ms)",
|
||||
"total_token_throughput": "Total Token Tput (tok/s)",
|
||||
"output_throughput": "Output Tput (tok/s)",
|
||||
"total_input_tokens": "Total input tokens",
|
||||
"total_output_tokens": "Total output tokens",
|
||||
"engine": "Engine",
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
# collect results
|
||||
for test_file in results_folder.glob("*.json"):
|
||||
with open(test_file) as f:
|
||||
raw_result = json.loads(f.read())
|
||||
|
||||
# attach the benchmarking command to raw_result
|
||||
with open(test_file.with_suffix(".commands")) as f:
|
||||
command = json.loads(f.read())
|
||||
raw_result.update(command)
|
||||
|
||||
# update the test name of this result
|
||||
raw_result.update({"test_name": test_file.stem})
|
||||
|
||||
# add the result to raw_result
|
||||
serving_results.append(raw_result)
|
||||
continue
|
||||
|
||||
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||
|
||||
if not serving_results.empty:
|
||||
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||
columns=serving_column_mapping
|
||||
)
|
||||
|
||||
serving_md_table_with_headers = tabulate(
|
||||
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||
)
|
||||
# remove the first line of header
|
||||
serving_md_table_lines = serving_md_table_with_headers.split("\n")
|
||||
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
|
||||
|
||||
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
|
||||
|
||||
# document benchmarking results in markdown
|
||||
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
|
||||
# document results with header.
|
||||
# for those who wants to reproduce our benchmark.
|
||||
f.write(serving_md_table_with_headers)
|
||||
f.write("\n")
|
||||
|
||||
# document benchmarking results in json
|
||||
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
|
||||
results = serving_results.to_dict(orient="records")
|
||||
f.write(json.dumps(results))
|
||||
@ -1,23 +0,0 @@
|
||||
#!/bin/sh
|
||||
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
|
||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
|
||||
else
|
||||
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
|
||||
fi
|
||||
|
||||
TIMEOUT_SECONDS=10
|
||||
|
||||
retries=0
|
||||
while [ $retries -lt 1000 ]; do
|
||||
if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Waiting for image to be available..."
|
||||
|
||||
retries=$((retries + 1))
|
||||
sleep 5
|
||||
done
|
||||
|
||||
exit 1
|
||||
@ -2,40 +2,23 @@
|
||||
|
||||
## Introduction
|
||||
|
||||
This directory contains two sets of benchmark for vllm.
|
||||
|
||||
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
|
||||
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
|
||||
|
||||
See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
|
||||
This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
|
||||
vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
|
||||
|
||||
## Performance benchmark quick overview
|
||||
|
||||
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
|
||||
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
|
||||
|
||||
**Benchmarking Duration**: about 1hr.
|
||||
|
||||
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
|
||||
|
||||
## Nightly benchmark quick overview
|
||||
|
||||
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
|
||||
|
||||
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
|
||||
|
||||
**Benchmarking Duration**: about 3.5hrs.
|
||||
|
||||
## Trigger the benchmark
|
||||
|
||||
Performance benchmark will be triggered when:
|
||||
|
||||
- A PR being merged into vllm.
|
||||
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
|
||||
|
||||
Manually Trigger the benchmark
|
||||
The benchmark needs to be triggered manually:
|
||||
|
||||
```bash
|
||||
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
```
|
||||
|
||||
Runtime environment variables:
|
||||
@ -47,10 +30,6 @@ Runtime environment variables:
|
||||
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
|
||||
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
|
||||
|
||||
Nightly benchmark will be triggered when:
|
||||
|
||||
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
|
||||
|
||||
## Performance benchmark details
|
||||
|
||||
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
|
||||
@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model,
|
||||
A comparison diagram will be generated below the table.
|
||||
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
|
||||
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
|
||||
|
||||
## Nightly test details
|
||||
|
||||
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
|
||||
|
||||
### Workflow
|
||||
|
||||
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
|
||||
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
|
||||
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
|
||||
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
|
||||
|
||||
### Nightly tests
|
||||
|
||||
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
|
||||
|
||||
### Docker containers
|
||||
|
||||
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
|
||||
|
||||
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
|
||||
|
||||
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
|
||||
@ -392,7 +392,7 @@ if __name__ == "__main__":
|
||||
json_file = "benchmark_results.json"
|
||||
with open(results_folder / md_file, "w") as f:
|
||||
results = read_markdown(
|
||||
"../.buildkite/nightly-benchmarks/"
|
||||
"../.buildkite/performance-benchmarks/"
|
||||
+ "performance-benchmarks-descriptions.md"
|
||||
)
|
||||
results = results.format(
|
||||
@ -469,7 +469,7 @@ main() {
|
||||
ensure_sharegpt_downloaded
|
||||
declare -g RESULTS_FOLDER=results/
|
||||
mkdir -p $RESULTS_FOLDER
|
||||
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
|
||||
QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
|
||||
|
||||
# dump vllm info via vllm collect-env
|
||||
env_output=$(vllm collect-env)
|
||||
2
.github/mergify.yml
vendored
2
.github/mergify.yml
vendored
@ -108,7 +108,7 @@ pull_request_rules:
|
||||
- files~=^benchmarks/
|
||||
- files~=^vllm/benchmarks/
|
||||
- files~=^tests/benchmarks/
|
||||
- files~=^\.buildkite/nightly-benchmarks/
|
||||
- files~=^\.buildkite/performance-benchmarks/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
|
||||
@ -9,7 +9,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu
|
||||
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
|
||||
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
|
||||
|
||||
[Benchmark CLI]: #benchmark-cli
|
||||
|
||||
@ -1167,7 +1166,7 @@ docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingf
|
||||
Then, run below command inside the docker instance.
|
||||
|
||||
```bash
|
||||
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
|
||||
```
|
||||
|
||||
When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.
|
||||
@ -1185,7 +1184,7 @@ For more results visualization, check the [visualizing the results](https://gith
|
||||
|
||||
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).
|
||||
|
||||
### Continuous Benchmarking
|
||||
|
||||
@ -1210,11 +1209,3 @@ The benchmarking currently runs on a predefined set of models configured in the
|
||||
#### Viewing Results
|
||||
|
||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
## Nightly Benchmarks
|
||||
|
||||
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
|
||||
|
||||
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
||||
|
||||
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
||||
|
||||
Reference in New Issue
Block a user