[Benchmark] Cleanup deprecated nightly benchmark and adjust the docstring for performance benchmark (#25786)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
@@ -1,184 +0,0 @@
steps:
  - label: "Wait for container to be ready"
    key: wait-for-container-image
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            containers:
              - image: badouralix/curl-jq
                command:
                  - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh

  - label: "Cleanup H100"
    agents:
      queue: H100
    depends_on: ~
    command: docker system prune -a --volumes --force

  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - kubernetes:
          podSpec:
            priorityClassName: perf-benchmark
            containers:
              - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: 4,5,6,7
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  #- block: "Run H100 Benchmark"
  #  key: block-h100
  #  depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  # Premerge benchmark
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - kubernetes:
          podSpec:
            priorityClassName: perf-benchmark
            containers:
              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: 4,5,6,7
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN

  #- block: "Run H100 Benchmark"
  #  key: block-h100
  #  depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
      - docker#v5.12.0:
          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
          command:
            - bash
            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
          mount-buildkite-agent: true
          propagate-environment: true
          ipc: host
          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
          volumes:
            - /data/benchmark-hf-cache:/root/.cache/huggingface
          environment:
            - VLLM_USAGE_SOURCE
            - HF_TOKEN
@@ -1,28 +0,0 @@
# Nightly benchmark annotation

## Description

This file contains the downloading link for benchmarking results.

- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)

Please download the visualization scripts in the post

## Results reproduction

- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code:

```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```

And the results will be inside `./benchmarks/results`.
@@ -1,39 +0,0 @@

# Nightly benchmark

This benchmark aims to:

- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.

Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

## Setup

- Docker images:
  - vLLM: `vllm/vllm-openai:v0.6.2`
  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
  - 8x Nvidia A100 GPUs
- Workload:
  - Dataset
    - ShareGPT dataset
    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
  - Models: llama-3 8B, llama-3 70B.
    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
  - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).

## Known issues

- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
@@ -1,196 +0,0 @@
common_pod_spec: &common_pod_spec
  priorityClassName: perf-benchmark
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  volumes:
    - name: devshm
      emptyDir:
        medium: Memory
    - name: hf-cache
      hostPath:
        path: /root/.cache/huggingface
        type: Directory

common_container_settings: &common_container_settings
  command:
    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  resources:
    limits:
      nvidia.com/gpu: 8
  volumeMounts:
    - name: devshm
      mountPath: /dev/shm
    - name: hf-cache
      mountPath: /root/.cache/huggingface
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token

steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."

  - label: "A100 vllm step 10"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.6.2
                <<: *common_container_settings

  - label: "A100 sglang benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: lmsysorg/sglang:v0.3.2-cu121
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: openmmlab/lmdeploy:v0.6.1-cu12
                <<: *common_container_settings

  - label: "A100 trt llama-8B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama8B"

  - label: "A100 trt llama-70B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama70B"

  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
  # - label: "A100 trt benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
  #               <<: *common_container_settings

  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
  # - label: "A100 tgi benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
  #               <<: *common_container_settings

  - wait

  - label: "Collect the results"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.5.0.post1
                command:
                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token

  - block: ":rocket: check the results!"
@@ -1,26 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

from transformers import AutoTokenizer


def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )

    args = parser.parse_args()
    main(args.model, args.cachedir)
@@ -1,97 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tabulate import tabulate


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        type=str,
        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):
    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.0)
        else:
            means.append(filtered_df[metric].values[0])

    return np.array(means)


def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
        std = get_perf(df, method, model, "Std " + metric + " (ms)")
        if std.mean() == 0:
            std = None
        success = get_perf(df, method, model, "Successful req.")
        if std is not None:
            std = std / np.sqrt(success)
            std = std.tolist()

    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
        mean = mean.tolist()
        std = None

    return mean, std


def main(args):
    results_folder = Path(args.results_folder)

    results = []

    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
        with open(test_file) as f:
            results = results + json.loads(f.read())

    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

    with open(args.description) as f:
        description = f.read()

    description = description.format(nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
@@ -1,9 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]

print(model_name)
@@ -1,78 +0,0 @@
#!/bin/bash

set -ex
set -o pipefail


main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
    (which zip) || (apt-get install -y zip)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
        exit 0
    fi

    # initial annotation
    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

    # download results
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/

    # upload benchmark results
    zip -r results.zip results/
    /workspace/buildkite-agent artifact upload "results.zip"

    # upload benchmarking scripts
    cd "$VLLM_SOURCE_CODE_LOC/"
    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"

    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    # upload benchmarking pipeline
    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"

    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md


    # The figures should be generated by a separate process outside the CI/CD pipeline

    # # generate figures
    # python3 -m pip install tabulate pandas matplotlib

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
    #     --description $description \
    #     --results-folder results/

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sharegpt

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_2048_128

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_128_2048

    # # upload results and figures
    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}

main "$@"
@@ -1,464 +0,0 @@
#!/bin/bash

set -o pipefail
set -x

check_gpus() {
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
    if [[ $gpu_count -gt 0 ]]; then
        echo "GPU found."
    else
        echo "Need at least 1 GPU to run benchmarking."
        exit 1
    fi
    declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
    echo "GPU type is $gpu_type"
}

check_hf_token() {
    # check if HF_TOKEN is available and valid
    if [[ -z "$HF_TOKEN" ]]; then
        echo "Error: HF_TOKEN is not set."
        exit 1
    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
        echo "Error: HF_TOKEN does not start with 'hf_'."
        exit 1
    else
        echo "HF_TOKEN is set and valid."
    fi
}


upload_to_buildkite() {
    # upload the benchmarking results to buildkite

    # if the agent binary is not found, skip uploading the results, exit 0
    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip uploading the results."
        return 0
    fi
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}


get_current_llm_serving_engine() {

    if which lmdeploy >/dev/null; then
        echo "Container: lmdeploy"
        export CURRENT_LLM_SERVING_ENGINE=lmdeploy
        return
    fi

    if [ -e /tgi-entrypoint.sh ]; then
        echo "Container: tgi"
        export CURRENT_LLM_SERVING_ENGINE=tgi
        return
    fi

    if which trtllm-build >/dev/null; then
        echo "Container: tensorrt-llm"
        export CURRENT_LLM_SERVING_ENGINE=trt
        return
    fi

    if [ -e /sgl-workspace ]; then
        echo "Container: sglang"
        export CURRENT_LLM_SERVING_ENGINE=sglang
        return
    fi

    if [ -e /vllm-workspace ]; then
        echo "Container: vllm"
        # move to a completely irrelevant directory, to avoid import vllm from current folder
        export CURRENT_LLM_SERVING_ENGINE=vllm

        return
    fi
}

json2args() {
    # transforms the JSON string to command line args, and '_' is replaced to '-'
    # example:
    # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
    # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
    local json_string=$1
    local args=$(
        echo "$json_string" | jq -r '
            to_entries |
            map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
            join(" ")
        '
    )
    echo "$args"
}

kill_gpu_processes() {
    pkill -f '[p]ython'
    pkill -f '[p]ython3'
    pkill -f '[t]ritonserver'
    pkill -f '[p]t_main_thread'
    pkill -f '[t]ext-generation'
    pkill -f '[l]mdeploy'
    # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
    pkill -f '[V]LLM'

    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
        sleep 1
    done
}

wait_for_server() {
    # wait for vllm server to start
    # return 1 if vllm server crashes
    timeout 1200 bash -c '
        until curl -s localhost:8000/v1/completions > /dev/null; do
            sleep 1
        done' && return 0 || return 1
}

ensure_installed() {
    # Ensure that the given command is installed by apt-get
    local cmd=$1
    if ! which "$cmd" >/dev/null; then
        apt-get update && apt-get install -y "$cmd"
    fi
}

run_serving_tests() {
    # run serving tests using `vllm bench serve` command
    # $1: a json file specifying serving test cases

    local serving_test_file
    serving_test_file=$1

    # Iterate over serving tests
    jq -c '.[]' "$serving_test_file" | while read -r params; do
        # get the test name, and append the GPU type back to it.
        test_name=$(echo "$params" | jq -r '.test_name')

        # if TEST_SELECTOR is set, only run the test cases that match the selector
        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
            echo "Skip test case $test_name."
            continue
        fi

        # prepend the current serving engine to the test name
        test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

        # get common parameters
        common_params=$(echo "$params" | jq -r '.common_parameters')
        model=$(echo "$common_params" | jq -r '.model')
        tp=$(echo "$common_params" | jq -r '.tp')
        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
        port=$(echo "$common_params" | jq -r '.port')
        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
        reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

        # get client and server arguments
        server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
        client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
        client_args=$(json2args "$client_params")
        qps_list=$(echo "$params" | jq -r '.qps_list')
        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
        echo "Running over qps list $qps_list"

        # check if there is enough GPU to run the test
        if [[ $gpu_count -lt $tp ]]; then
            echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
            continue
        fi

        if [[ $reuse_server == "true" ]]; then
            echo "Reuse previous server for test case $test_name"
        else
            kill_gpu_processes
            bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
                "$server_params" "$common_params"
        fi

        if wait_for_server; then
            echo ""
            echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
        else
            echo ""
            echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
            break
        fi

        # prepare tokenizer
        # this is required for lmdeploy.
        cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
        rm -rf /tokenizer_cache
        mkdir /tokenizer_cache
        python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
            --model "$model" \
            --cachedir /tokenizer_cache
        cd "$VLLM_SOURCE_CODE_LOC/benchmarks"


        # change model name for lmdeploy (it will not follow standard hf name)
        if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
            model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
        fi

        # iterate over different QPS
        for qps in $qps_list; do
            # remove the surrounding single quote from qps
            if [[ "$qps" == *"inf"* ]]; then
                echo "qps was $qps"
                qps="inf"
                echo "now qps is $qps"
            fi

            new_test_name=$test_name"_qps_"$qps

            backend=$CURRENT_LLM_SERVING_ENGINE

            if [[ $backend = "trt" ]]; then
                backend="tensorrt-llm"
            fi

            if [[ "$backend" == *"vllm"* ]]; then
                backend="vllm"
            fi

            if [[ "$dataset_name" = "sharegpt" ]]; then

                client_command="vllm bench serve \
                    --backend $backend \
                    --tokenizer /tokenizer_cache \
                    --model $model \
                    --dataset-name $dataset_name \
                    --dataset-path $dataset_path \
                    --num-prompts $num_prompts \
                    --port $port \
                    --save-result \
                    --result-dir $RESULTS_FOLDER \
                    --result-filename ${new_test_name}.json \
                    --request-rate $qps \
                    --ignore-eos \
                    $client_args"

            elif [[ "$dataset_name" = "sonnet" ]]; then

                sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
                sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
                sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

                client_command="vllm bench serve \
                    --backend $backend \
                    --tokenizer /tokenizer_cache \
                    --model $model \
                    --dataset-name $dataset_name \
                    --dataset-path $dataset_path \
                    --num-prompts $num_prompts \
                    --sonnet-input-len $sonnet_input_len \
                    --sonnet-output-len $sonnet_output_len \
                    --sonnet-prefix-len $sonnet_prefix_len \
                    --port $port \
                    --save-result \
                    --result-dir $RESULTS_FOLDER \
                    --result-filename ${new_test_name}.json \
                    --request-rate $qps \
                    --ignore-eos \
                    $client_args"

            else

                echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
                exit 1

            fi


            echo "Running test case $test_name with qps $qps"
            echo "Client command: $client_command"

            eval "$client_command"

            server_command="None"

            # record the benchmarking commands
            jq_output=$(jq -n \
                --arg server "$server_command" \
                --arg client "$client_command" \
                --arg gpu "$gpu_type" \
                --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
                '{
                    server_command: $server,
                    client_command: $client,
                    gpu_type: $gpu,
                    engine: $engine
                }')
            echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

        done

    done

    kill_gpu_processes
}

run_genai_perf_tests() {
    # run genai-perf tests

    # $1: a json file specifying genai-perf test cases
    local genai_perf_test_file
    genai_perf_test_file=$1

    # Iterate over genai-perf tests
    jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
        # get the test name, and append the GPU type back to it.
        test_name=$(echo "$params" | jq -r '.test_name')

        # if TEST_SELECTOR is set, only run the test cases that match the selector
        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
            echo "Skip test case $test_name."
            continue
        fi

        # prepend the current serving engine to the test name
        test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

        # get common parameters
        common_params=$(echo "$params" | jq -r '.common_parameters')
        model=$(echo "$common_params" | jq -r '.model')
        tp=$(echo "$common_params" | jq -r '.tp')
        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
        port=$(echo "$common_params" | jq -r '.port')
        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
        reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

        # get client and server arguments
        server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
        qps_list=$(echo "$params" | jq -r '.qps_list')
        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
        echo "Running over qps list $qps_list"

        # check if there is enough GPU to run the test
        if [[ $gpu_count -lt $tp ]]; then
            echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
            continue
        fi

        if [[ $reuse_server == "true" ]]; then
            echo "Reuse previous server for test case $test_name"
        else
            kill_gpu_processes
            bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
                "$server_params" "$common_params"
        fi

        if wait_for_server; then
            echo ""
            echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
        else
            echo ""
            echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
            break
        fi

        # iterate over different QPS
        for qps in $qps_list; do
            # remove the surrounding single quote from qps
            if [[ "$qps" == *"inf"* ]]; then
                echo "qps was $qps"
                qps=$num_prompts
                echo "now qps is $qps"
            fi

            new_test_name=$test_name"_qps_"$qps
            backend=$CURRENT_LLM_SERVING_ENGINE

            if [[ "$backend" == *"vllm"* ]]; then
                backend="vllm"
            fi
            #TODO: add output dir.
            client_command="genai-perf profile \
                -m $model \
                --service-kind openai \
                --backend "$backend" \
                --endpoint-type chat \
                --streaming \
                --url localhost:$port \
                --request-rate $qps \
                --num-prompts $num_prompts \
                "

            echo "Client command: $client_command"

            eval "$client_command"

            #TODO: process/record outputs
        done
    done

    kill_gpu_processes

}

prepare_dataset() {

    # download sharegpt dataset
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

    # duplicate sonnet by 4x, to allow benchmarking with input length 2048
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    echo "" > sonnet_4x.txt
    for _ in {1..4}
    do
        cat sonnet.txt >> sonnet_4x.txt
    done

}

main() {

    # check if the environment variable is successfully injected from yaml

    check_gpus
    check_hf_token
    get_current_llm_serving_engine

    pip install -U transformers

    pip install -r requirements/dev.txt
    which genai-perf

    # check storage
    df -h

    ensure_installed wget
    ensure_installed curl
    ensure_installed jq
    # genai-perf dependency
    ensure_installed libb64-0d

    prepare_dataset

    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    declare -g RESULTS_FOLDER=results/
    mkdir -p $RESULTS_FOLDER
    BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"

    # run the test
    run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

    # run genai-perf tests
    run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
    mv artifacts/ $RESULTS_FOLDER/

    # upload benchmark results to buildkite
    python3 -m pip install tabulate pandas
    python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
    upload_to_buildkite

}

main "$@"
@@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "std_tpot_ms": "Std TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
}

if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # add the result to raw_result
        serving_results.append(raw_result)
        continue

    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")

    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))
@@ -1,23 +0,0 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi

TIMEOUT_SECONDS=10

retries=0
while [ $retries -lt 1000 ]; do
    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
        exit 0
    fi

    echo "Waiting for image to be available..."

    retries=$((retries + 1))
    sleep 5
done

exit 1
@@ -2,40 +2,23 @@

 ## Introduction

-This directory contains two sets of benchmark for vllm.
-
-- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
-- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.

 ## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.

 **Benchmarking Duration**: about 1hr.

 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.

-## Nightly benchmark quick overview
-
-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
-
-**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
-
-**Benchmarking Duration**: about 3.5hrs.
-
 ## Trigger the benchmark

-Performance benchmark will be triggered when:
-
-- A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
-
-Manually Trigger the benchmark
+The benchmark needs to be triggered manually:

 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```

 Runtime environment variables:
@@ -47,10 +30,6 @@ Runtime environment variables:

 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
-
-Nightly benchmark will be triggered when:
-
-- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

 ## Performance benchmark details

 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
@@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model,

 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
-
-## Nightly test details
-
-See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
-
-### Workflow
-
-- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
-- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
-- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
-
-### Nightly tests
-
-In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
-
-### Docker containers
-
-The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
-
-WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
@@ -392,7 +392,7 @@ if __name__ == "__main__":
     json_file = "benchmark_results.json"
     with open(results_folder / md_file, "w") as f:
         results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
+            "../.buildkite/performance-benchmarks/"
             + "performance-benchmarks-descriptions.md"
         )
         results = results.format(
@@ -469,7 +469,7 @@ main() {
     ensure_sharegpt_downloaded
     declare -g RESULTS_FOLDER=results/
     mkdir -p $RESULTS_FOLDER
-    QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+    QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/

     # dump vllm info via vllm collect-env
     env_output=$(vllm collect-env)
.github/mergify.yml
@@ -108,7 +108,7 @@ pull_request_rules:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/
       - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
+      - files~=^\.buildkite/performance-benchmarks/
     actions:
       label:
         add:
@@ -9,7 +9,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu
 - **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
 - **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations
 - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
-- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives

 [Benchmark CLI]: #benchmark-cli

@@ -1167,7 +1166,7 @@ docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingf
 Then, run below command inside the docker instance.

 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```

 When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json.
@@ -1185,7 +1184,7 @@ For more results visualization, check the [visualizing the results](https://gith

 The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

-More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
+More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md).

 ### Continuous Benchmarking

@@ -1210,11 +1209,3 @@ The benchmarking currently runs on a predefined set of models configured in the
 #### Viewing Results

 All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
-
-## Nightly Benchmarks
-
-These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
-
-The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
-
-More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).