Compare commits

..

14 Commits

Author SHA1 Message Date
70b4e46e70 compilation is fixed 2025-02-06 20:49:29 +00:00
5fb9dbe6f6 fix capture model 2025-02-06 20:18:30 +00:00
996b92ccb4 swap works! 2025-02-05 20:28:33 +00:00
2b0526fa15 works! 2025-02-05 16:54:57 +00:00
7be649256f fixes 2025-02-05 15:36:38 +00:00
627efde813 fixes 2025-02-04 22:16:19 +00:00
c2867d5bc1 Optimize decode/prompt prepare code 2025-02-04 21:12:07 +00:00
39c4a4cdb5 review comments
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-01-28 23:08:50 +00:00
1ccf100c6a clean-ups
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-01-28 23:08:50 +00:00
248c5b632d works
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-01-28 23:08:50 +00:00
950f349492 scheduler is clean
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-01-28 23:08:50 +00:00
61bb55f3d5 Chunked prompt works!
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-01-28 23:08:50 +00:00
0bddb6b9a5 reorder funcs
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-01-28 23:08:50 +00:00
c715fb19e5 [V1] TPU support
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-01-28 23:08:50 +00:00
1502 changed files with 25106 additions and 105805 deletions

View File

@ -1,14 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
def print_top_10_largest_files(zip_file):

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import os

View File

@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.6353
- name: "exact_match,flexible-extract"
value: 0.637
limit: null
num_fewshot: null

View File

@ -1,4 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

View File

@ -1,13 +1,15 @@
# vLLM benchmark suite
## Introduction
This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
@ -17,14 +19,17 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
## Nightly benchmark quick overview
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
Performance benchmark will be triggered when:
@ -34,11 +39,16 @@ Performance benchmark will be triggered when:
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
### Latency test
#### Latency test
Here is an example of one test inside `latency-tests.json`:
@ -58,25 +68,23 @@ Here is an example of one test inside `latency-tests.json`:
```
In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
### Throughput test
#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
### Serving test
#### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json
```
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
@ -101,7 +109,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
```
Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
@ -111,33 +118,36 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
### Visualizing the results
#### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
#### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
### Nightly tests
#### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
### Docker containers
#### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -10,18 +10,12 @@ steps:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- label: "Cleanup H100"
agents:
queue: H100
depends_on: ~
command: docker system prune -a --volumes --force
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- kubernetes:
podSpec:
@ -56,7 +50,6 @@ steps:
agents:
queue: H200
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -82,7 +75,6 @@ steps:
agents:
queue: H100
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -98,87 +90,3 @@ steps:
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
# Premerge benchmark
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- label: "H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: 4,5,6,7
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN

View File

@ -9,19 +9,20 @@ This file contains the downloading link for benchmarking results.
Please download the visualization scripts in the post
## Results reproduction
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```console
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code
```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.

View File

@ -2,7 +2,6 @@
# Nightly benchmark
This benchmark aims to:
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
@ -10,6 +9,7 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
- Docker images:
@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
## Known issues
# Known issues
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
- TGI does not support `ignore-eos` flag.

View File

@ -7,8 +7,10 @@
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
{latency_tests_markdown_table}
## Throughput tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@ -17,8 +19,10 @@
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
{throughput_tests_markdown_table}
## Serving tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@ -29,11 +33,13 @@
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.
This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:
```python
@ -48,9 +54,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```
The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```
You can also check the raw experiment data in the Artifact tab of the Buildkite page.

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import json
import os
from pathlib import Path
@ -84,13 +82,8 @@ if __name__ == "__main__":
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
except OSError as e:
print(e)
continue
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
@ -104,13 +97,8 @@ if __name__ == "__main__":
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
except OSError as e:
print(e)
continue
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
@ -131,13 +119,8 @@ if __name__ == "__main__":
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
except OSError as e:
print(e)
continue
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
from transformers import AutoTokenizer

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
from pathlib import Path

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")

View File

@ -309,14 +309,11 @@ run_serving_tests() {
new_test_name=$test_name"_qps_"$qps
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--metadata "tensor_parallel_size=$tp" \
$client_args"
echo "Running test case $test_name with qps $qps"
@ -348,11 +345,6 @@ main() {
check_gpus
check_hf_token
# Set to v1 to run v1 benchmark
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
export VLLM_USE_V1=1
fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import datetime
import json
import os

View File

@ -1,10 +1,6 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10

View File

@ -29,4 +29,4 @@
"num-iters": 15
}
}
]
]

View File

@ -66,7 +66,8 @@
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1
"speculative_draft_tensor_parallel_size": 1,
"use_v2_block_manager": ""
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -32,4 +32,4 @@
"backend": "vllm"
}
}
]
]

View File

@ -1,15 +1,4 @@
steps:
- label: "Build wheel - CUDA 12.4"
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.1"
agents:
queue: cpu_queue_postmerge
@ -48,7 +37,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"
@ -67,11 +56,6 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- input: "Provide Release version here"
fields:
- text: "What is the release version?"
key: "release-version"
- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
@ -82,7 +66,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
env:
DOCKER_BUILDKIT: "1"

View File

@ -77,6 +77,7 @@ echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
@ -91,14 +92,7 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py \
--ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/test_mamba_mixer2.py \
--ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_permute_cols.py"
--ignore=kernels/test_sampler.py"
fi
#ignore certain Entrypoints tests
@ -127,8 +121,6 @@ if [[ $commands == *"--shard-id="* ]]; then
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
--name "${container_name}_${GPU}" \
@ -156,8 +148,6 @@ else
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
--name "${container_name}" \

View File

@ -30,7 +30,7 @@ function cpu_tests() {
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
python3 examples/offline_inference/basic.py"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "

View File

@ -23,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic.py
'

View File

@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
EXITCODE=$?

View File

@ -29,6 +29,9 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune -f
# Remove huggingface model artifacts and compiler cache
rm -rf "${HF_MOUNT:?}/*"
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
@ -51,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"

View File

@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py

0
.buildkite/run-tpu-test.sh Executable file → Normal file
View File

View File

@ -14,6 +14,6 @@ remove_docker_container
# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
python3 examples/offline_inference/basic.py
python3 examples/offline_inference/cli.py -tp 2
'

View File

@ -2,7 +2,7 @@
# adding a new command to an existing step. See different options here for examples.
# This script will be feed into Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
# Documentation
@ -15,7 +15,7 @@
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
@ -24,8 +24,8 @@
# When adding a test
# - If the test belong to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
steps:
##### fast check tests #####
@ -50,9 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
- tests/standalone_tests/lazy_imports.py
- tests/standalone_tests/lazy_torch_compile.py
commands:
- python3 standalone_tests/lazy_imports.py
- python3 standalone_tests/lazy_torch_compile.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@ -107,17 +107,13 @@ steps:
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
- tests/entrypoints/offline_mode
commands:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@ -128,15 +124,11 @@ steps:
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile/test_basic_correctness
- tests/compile
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
commands:
- VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@ -144,17 +136,16 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- python3 ../examples/offline_inference/rlhf.py
- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
- label: Metrics, Tracing Test # 10min
num_gpus: 2
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/
- tests/metrics
- tests/tracing
commands:
- pytest -v -s metrics
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
@ -181,9 +172,6 @@ steps:
- vllm/
- tests/engine
- tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
@ -195,19 +183,7 @@ steps:
- vllm/
- tests/v1
commands:
# split the test to avoid interference
- VLLM_USE_V1=1 pytest -v -s v1/core
- VLLM_USE_V1=1 pytest -v -s v1/engine
- VLLM_USE_V1=1 pytest -v -s v1/sample
- VLLM_USE_V1=1 pytest -v -s v1/worker
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- VLLM_USE_V1=1 pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- VLLM_USE_V1=1 pytest -v -s v1
- label: Examples Test # 25min
working_dir: "/vllm-workspace/examples"
@ -217,18 +193,18 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 offline_inference/classification.py
- python3 offline_inference/embedding.py
- python3 offline_inference/scoring.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
@ -256,7 +232,7 @@ steps:
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
- tests/model_executor/test_guided_processors
commands:
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py
@ -267,7 +243,7 @@ steps:
- vllm/model_executor/models/eagle.py
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
@ -275,10 +251,10 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
parallelism: 4
- label: PyTorch Fullgraph Smoke Test # 9min
- label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true
source_file_dependencies:
- vllm/
@ -289,7 +265,7 @@ steps:
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- label: PyTorch Fullgraph Test # 18min
- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
- vllm/
- tests/compile
@ -341,14 +317,6 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: OpenAI API correctness
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min
source_file_dependencies:
- vllm/
@ -372,7 +340,6 @@ steps:
- vllm/
- tests/models
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py
@ -503,14 +470,12 @@ steps:
- entrypoints/llm/test_collective_rpc.py
commands:
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
@ -534,7 +499,6 @@ steps:
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
@ -584,12 +548,11 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min
@ -610,7 +573,7 @@ steps:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
##### multi gpus test #####
@ -622,7 +585,7 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py

View File

@ -50,11 +50,8 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
else
# only upload index.html for cu124 wheels (default wheels)
# only upload index.html for cu12 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@ -66,11 +63,8 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
else
# only upload index.html for cu124 wheels (default wheels)
# only upload index.html for cu12 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

View File

@ -30,6 +30,15 @@ body:
</details>
validations:
required: true
- type: textarea
attributes:
label: Model Input Dumps
description: |
If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
placeholder: |
Upload the dumped input file.
validations:
required: false
- type: textarea
attributes:
label: 🐛 Describe the bug

View File

@ -2,5 +2,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **

View File

@ -23,7 +23,7 @@ updates:
- dependency-name: "lm-format-enforcer"
- dependency-name: "gguf"
- dependency-name: "compressed-tensors"
- dependency-name: "ray[cgraph]" # Ray Compiled Graph
- dependency-name: "ray[adag]"
- dependency-name: "lm-eval"
groups:
minor-update:

38
.github/mergify.yml vendored
View File

@ -5,7 +5,6 @@ pull_request_rules:
- or:
- files~=^[^/]+\.md$
- files~=^docs/
- files~=^examples/
actions:
label:
add:
@ -36,43 +35,6 @@ pull_request_rules:
add:
- frontend
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- or:
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files=benchmarks/benchmark_serving_guided.py
- files=benchmarks/benchmark_guided.py
actions:
label:
add:
- structured-output
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- or:
- files~=^vllm/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/
actions:
label:
add:
- speculative-decoding
- name: label-v1
description: Automatically apply v1 label
conditions:
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
actions:
label:
add:
- v1
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict

View File

@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: '3.12'

View File

@ -12,17 +12,17 @@ jobs:
fetch-depth: 0
- name: Set up Helm
uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
with:
version: v3.14.4
#Python is required because ct lint runs Yamale and yamllint which require Python.
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: '3.13'
- name: Set up chart-testing
uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
with:
version: v3.10.1
@ -47,7 +47,7 @@ jobs:
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
- name: Create kind cluster
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

View File

@ -10,11 +10,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with:
extra_args: --all-files --hook-stage manual

View File

@ -2,6 +2,7 @@ name: PR Reminder Comment Bot
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
@ -14,12 +15,7 @@ jobs:
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
'🚀'
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months

View File

@ -1,51 +1,43 @@
default_stages:
- pre-commit # Run locally
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
- repo: https://github.com/google/yapf
rev: v0.43.0
rev: v0.32.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3
rev: v0.6.5
hooks:
- id: ruff
args: [--output-format, github, --fix]
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.0
rev: v2.3.0
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
- repo: https://github.com/PyCQA/isort
rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
rev: 5.13.2
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.7
rev: v18.1.5
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.27
hooks:
- id: pymarkdown
args: [fix]
files: docs/.*
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
rev: v1.7.6
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.2
hooks:
- id: pip-compile
args: [requirements-test.in, -o, requirements-test.txt]
files: ^requirements-test\.(in|txt)$
- repo: local
hooks:
- id: mypy-local
@ -93,37 +85,8 @@ repos:
entry: tools/png-lint.sh
language: script
types: [png]
- id: signoff-commit
name: Sign-off Commit
entry: bash
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
fi
language: system
verbose: true
stages: [commit-msg]
- id: check-spdx-header
name: Check SPDX headers
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: check-filenames
name: Check for spaces in all filenames
entry: bash
args:
- -c
- 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
language: system
always_run: true
pass_filenames: false
# Keep `suggestion` last
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false
# Insert new entries above the `suggestion` entry
verbose: true

View File

@ -31,10 +31,10 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -174,25 +174,6 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Set rocm version dev int.
#
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
#
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
#
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
#
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
endif()
#
# Define other extension targets
#
@ -211,7 +192,7 @@ set_gencode_flags_for_srcs(
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling cumem allocator extension.")
# link against cuda driver library
list(APPEND CUMEM_LIBS CUDA::cuda_driver)
list(APPEND CUMEM_LIBS cuda)
define_gpu_extension_target(
cumem_allocator
DESTINATION vllm
@ -247,8 +228,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -265,8 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.8.0
GIT_TAG v3.6.0
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -285,9 +264,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
"csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
@ -297,7 +275,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
@ -317,32 +295,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
" in CUDA target architectures")
endif()
# Only build AllSpark kernels if we are building for at least some compatible archs.
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
set(ALLSPARK_SRCS
"csrc/quantization/gptq_allspark/allspark_repack.cu"
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
set_gencode_flags_for_srcs(
SRCS "${ALLSPARK_SRCS}"
CUDA_ARCHS "${ALLSPARK_ARCHS}")
list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
else()
message(STATUS "Not building AllSpark kernels as no compatible archs found"
" in CUDA target architectures, or CUDA not >= 12.0")
endif()
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a;10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
# CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@ -369,7 +326,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@ -394,9 +351,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# 2:4 Sparse Kernels
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper and Blackwell).
# require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@ -414,40 +372,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
# clear FP4_ARCHS
set(FP4_ARCHS)
endif()
# FP8 Blackwell Archs
cuda_archs_loose_intersection(BLACKWELL_ARCHS "10.0;10.1;12.0" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND BLACKWELL_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${BLACKWELL_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
message(STATUS "Building FP8 for archs: ${BLACKWELL_ARCHS}")
else()
# clear BLACKWELL_ARCHS
set(BLACKWELL_ARCHS)
endif()
#
# Machete kernels
@ -528,8 +453,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
@ -553,7 +477,7 @@ set_gencode_flags_for_srcs(
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@ -607,8 +531,77 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
WITH_SOABI)
endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/vllm_flash_attn.cmake)
# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
return()
endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 90eacc1af2a7c3de62ea249e929ed5faccf38954
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py"
)
# Nothing after vllm-flash-attn, see comment about macros above

View File

@ -61,7 +61,7 @@ representative at an online or offline/IRL event.
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Slack](https://slack.vllm.ai).
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
@ -125,3 +125,4 @@ Community Impact Guidelines were inspired by
For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).

View File

@ -27,9 +27,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
@ -53,15 +50,15 @@ WORKDIR /workspace
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
fi
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -81,8 +78,8 @@ ARG TARGETPLATFORM
# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
COPY . .
ARG GIT_REPO_CHECK=0
@ -101,7 +98,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
@ -121,7 +118,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
@ -129,8 +126,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=400
# Default max size of the wheel is 250MB
ARG VLLM_MAX_SIZE_MB=250
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -146,14 +143,13 @@ FROM base as dev
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
@ -177,9 +173,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -191,43 +184,22 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
fi
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ rm -rf build
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
# $ ls dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
fi
COPY examples examples
# Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements-build.txt
#################### vLLM installation IMAGE ####################
#################### TEST IMAGE ####################
@ -238,16 +210,16 @@ FROM vllm-base AS test
ADD . /vllm-workspace/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet)
@ -266,11 +238,11 @@ RUN mv vllm test_docs/
FROM vllm-base AS vllm-openai-base
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image

View File

@ -23,12 +23,10 @@ WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict
RUN python3 -m pip uninstall -y transformers-neuronx
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
@ -45,10 +43,6 @@ RUN --mount=type=bind,source=.git,target=.git \
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
# install transformers-neuronx package as an optional dependencies (for V0)
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py

View File

@ -4,12 +4,12 @@ USER root
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
COPY ./ /workspace/vllm
@ -21,6 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0

View File

@ -6,7 +6,7 @@ ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_BRANCH="8d4926e"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"

View File

@ -1,4 +1,4 @@
ARG NIGHTLY_DATE="20250124"
ARG NIGHTLY_DATE="20250122"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE

View File

@ -10,23 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
</h3>
<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>
---
Were excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
---
*Latest News* 🔥
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
@ -42,12 +32,10 @@ Join us to connect with the **vLLM team** and explore how vLLM is leveraged in *
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
vLLM is fast with:
@ -138,7 +126,6 @@ We also have an official fundraising venue through [OpenCollective](https://open
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
```bibtex
@inproceedings{kwon2023efficient,
title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
@ -150,11 +137,12 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
- For technical questions and feature requests, please use Github issues or discussions.
- For discussing with fellow users and coordinating contributions and development, please use Slack.
- For security disclosures, please use Github's security advisory feature.
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
## Media Kit
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).

View File

@ -1,54 +0,0 @@
# Releasing vLLM
vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
## Release Versioning
vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
* _minor_ major features
* _patch_ features and backwards-compatible bug fixes
* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
## Release Cadence
Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
| Release Date | Patch release versions | Post Release versions |
| --- | --- | --- |
| Jan 2025 | 0.7.0 | --- |
| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- |
| Mar 2025 | 0.7.4, 0.7.5 | --- |
| Apr 2025 | 0.7.6, 0.7.7 | --- |
| May 2025 | 0.7.8, 0.7.9 | --- |
| Jun 2025 | 0.7.10, 0.7.11 | --- |
| Jul 2025 | 0.7.12, 0.7.13 | --- |
| Aug 2025 | 0.7.14, 0.7.15 | --- |
| Sep 2025 | 0.7.16, 0.7.17 | --- |
| Oct 2025 | 0.7.18, 0.7.19 | --- |
| Nov 2025 | 0.7.20, 0.7.21 | --- |
| Dec 2025 | 0.7.22, 0.7.23 | --- |
## Release branch
Each release is built from a dedicated release branch.
* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
* For post releases, previously cut release branch is reused
* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
## Release Cherry-Pick Criteria
After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
* Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release)
* Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks
* Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release)
* Documentation improvements
* Release branch specific changes (e.g. change version identifiers or CI fixes)
Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.

View File

@ -3,7 +3,6 @@
## Downloading the ShareGPT dataset
You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
@ -12,18 +11,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
will ignore a datapoint if the referred image is missing.
```bash
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
mkdir coco -p
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
unzip coco/train2017.zip -d coco/
```
# Downloading the BurstGPT dataset
You can download the BurstGPT v1.1 dataset by running:
```bash
wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
```

View File

@ -1,12 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union
from typing import List, Optional, Union
import aiohttp
import huggingface_hub.constants
@ -14,8 +12,6 @@ from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
from vllm.model_executor.model_loader.weight_utils import get_lock
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@ -41,8 +37,8 @@ class RequestFuncOutput:
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
itl: list[float] = field(
default_factory=list) # list of inter-token latencies
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@ -55,8 +51,7 @@ async def async_request_tgi(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
params = {
"best_of": request_func_input.best_of,
"max_new_tokens": request_func_input.output_len,
@ -128,8 +123,7 @@ async def async_request_trt_llm(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
payload = {
"accumulate_tokens": True,
@ -193,8 +187,7 @@ async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
payload = {
@ -242,8 +235,7 @@ async def async_request_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
@ -341,8 +333,7 @@ async def async_request_openai_chat_completions(
"chat/completions"
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
@ -432,15 +423,12 @@ def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
return model_path
return pretrained_model_name_or_path

View File

@ -1,4 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses
@ -6,6 +5,7 @@ import json
import os
import random
import time
from typing import List
import datasets
import pandas as pd
@ -38,23 +38,17 @@ class SampleRequest:
completion: str = None
def run_vllm(requests: list[SampleRequest],
def run_vllm(requests: List[SampleRequest],
engine_args: EngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
warmup: bool = False) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**vars(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
# Add the requests to the engine.
prompts: list[str] = []
sampling_params: list[SamplingParams] = []
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
# create a list containing random selected true or false
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
@ -109,7 +103,7 @@ def run_vllm(requests: list[SampleRequest],
async def run_vllm_async(
requests: list[SampleRequest],
requests: List[SampleRequest],
engine_args: AsyncEngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
@ -120,16 +114,9 @@ async def run_vllm_async(
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
assert all(
llm.model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
# Add the requests to the engine.
prompts: list[str] = []
sampling_params: list[SamplingParams] = []
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
@ -202,7 +189,7 @@ async def run_vllm_async(
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> list[SampleRequest]:
args: argparse.Namespace) -> List[SampleRequest]:
if args.dataset == 'json':
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -286,7 +273,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
elif args.dataset == "xgrammar_bench":
args.warmup = False
requests: list[SampleRequest] = []
requests: List[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
print(f"dataset has {len(dataset)} entries")

View File

@ -1,17 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
import json
import os
import time
from pathlib import Path
from typing import Any, Optional
from typing import List, Optional
import numpy as np
import torch
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from vllm import LLM, SamplingParams
@ -21,18 +17,6 @@ from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k]
for k in ["avg_latency", "percentiles"]})
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def main(args: argparse.Namespace):
print(args)
@ -41,10 +25,6 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len +
args.output_len), ("Please ensure that max_model_len is greater than"
" the sum of input_len and output_len.")
sampling_params = SamplingParams(
n=args.n,
@ -57,7 +37,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompts: list[PromptType] = [{
dummy_prompts: List[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
@ -73,8 +53,7 @@ def main(args: argparse.Namespace):
beam_width=args.n,
max_tokens=args.output_len,
ignore_eos=True,
),
)
))
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
@ -84,8 +63,7 @@ def main(args: argparse.Namespace):
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir)),
) as p:
str(profile_dir))) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
else:
@ -102,8 +80,9 @@ def main(args: argparse.Namespace):
if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = (Path(".") / "vllm_benchmark_result" /
f"latency_result_{time.time()}")
profile_dir = Path(
"."
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return
@ -115,9 +94,9 @@ def main(args: argparse.Namespace):
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)
print(f"Avg latency: {np.mean(latencies)} seconds")
print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles):
print(f"{percentage}% percentile latency: {percentile} seconds")
print(f'{percentage}% percentile latency: {percentile} seconds')
# Output JSON results if specified
if args.output_json:
@ -128,51 +107,43 @@ def main(args: argparse.Namespace):
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
if __name__ == "__main__":
if __name__ == '__main__':
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion.")
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument(
"--n",
type=int,
default=1,
help="Number of generated sequences per prompt.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--num-iters-warmup",
type=int,
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument("--num-iters",
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
parser.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt.')
parser.add_argument('--use-beam-search', action='store_true')
parser.add_argument('--num-iters-warmup',
type=int,
default=10,
help='Number of iterations to run for warmup.')
parser.add_argument('--num-iters',
type=int,
default=30,
help="Number of iterations to run.")
help='Number of iterations to run.')
parser.add_argument(
"--profile",
action="store_true",
help="profile the generation process of a single batch",
)
'--profile',
action='store_true',
help='profile the generation process of a single batch')
parser.add_argument(
"--profile-result-dir",
'--profile-result-dir',
type=str,
default=None,
help=("path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."),
)
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
parser.add_argument(
"--output-json",
'--output-json',
type=str,
default=None,
help="Path to save the latency results in JSON format.",
)
help='Path to save the latency results in JSON format.')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()

View File

@ -1,4 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
"""
Offline benchmark to test the long document QA throughput.

View File

@ -1,4 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
"""
Benchmark the efficiency of prefix caching.
@ -31,7 +30,7 @@ import dataclasses
import json
import random
import time
from typing import Optional
from typing import List, Optional, Tuple
from transformers import PreTrainedTokenizerBase
@ -77,9 +76,9 @@ def sample_requests_from_dataset(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
) -> list[Request]:
) -> List[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -99,7 +98,7 @@ def sample_requests_from_dataset(
assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
# Filter out sequences that are too long or too short
filtered_requests: list[Request] = []
filtered_requests: List[Request] = []
for i in range(len(dataset)):
if len(filtered_requests) == num_requests:
@ -122,10 +121,10 @@ def sample_requests_from_dataset(
def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
input_length_range: Tuple[int, int],
fixed_output_len: Optional[int],
prefix_len: int,
) -> list[Request]:
) -> List[Request]:
requests = []
prefix_token_ids = sample_tokens(tokenizer, prefix_len)
@ -144,9 +143,9 @@ def sample_requests_from_random(
return requests
def repeat_and_sort_requests(requests: list[Request],
def repeat_and_sort_requests(requests: List[Request],
repeat_count: int,
sort: bool = False) -> list[str]:
sort: bool = False) -> List[str]:
repeated_requests = requests * repeat_count
if sort:
repeated_requests.sort(key=lambda x: x[1])

View File

@ -1,11 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline prioritization."""
import argparse
import dataclasses
import json
import random
import time
from typing import Optional
from typing import List, Optional, Tuple
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -13,17 +12,12 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
#Select a equi-probable random priority
def get_random_flag():
return 0 if random.random() < 0.5 else 1
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> list[tuple[str, int, int]]:
) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -40,7 +34,7 @@ def sample_requests(
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: list[tuple[str, int, int]] = []
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
@ -60,7 +54,8 @@ def sample_requests(
# Prune too long sequences.
continue
priority = get_random_flag()
#Select a equi-probable random priority
priority = 0 if random.random() < 0.5 else 1
filtered_dataset.append((prompt, prompt_len, output_len, priority))
@ -68,19 +63,13 @@ def sample_requests(
def run_vllm(
requests: list[tuple[str, int, int]],
requests: List[Tuple[str, int, int]],
n: int,
engine_args: EngineArgs,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" input_len and output_len for all requests.")
# Add the requests to the engine.
prompts = []
sampling_params = []
@ -113,8 +102,8 @@ def main(args: argparse.Namespace):
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len,
get_random_flag()) for _ in range(args.num_prompts)]
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)

View File

@ -1,4 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
r"""Benchmark online serving throughput.
On the server side, run one of the following commands:
@ -33,13 +32,11 @@ import os
import random
import time
import warnings
from collections.abc import AsyncGenerator, Collection
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
import numpy as np
import pandas as pd
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from datasets import load_dataset
@ -57,8 +54,6 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -74,22 +69,22 @@ class BenchmarkMetrics:
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
percentiles_ttft_ms: list[tuple[float, float]]
percentiles_ttft_ms: List[Tuple[float, float]]
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
percentiles_tpot_ms: list[tuple[float, float]]
percentiles_tpot_ms: List[Tuple[float, float]]
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
percentiles_itl_ms: list[tuple[float, float]]
percentiles_itl_ms: List[Tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: list[tuple[float, float]]
percentiles_e2el_ms: List[Tuple[float, float]]
def sample_sharegpt_requests(
@ -97,7 +92,7 @@ def sample_sharegpt_requests(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> list[tuple[str, int, int, None]]:
) -> List[Tuple[str, int, int, None]]:
# Load the dataset.
with open(dataset_path, encoding='utf-8') as f:
dataset = json.load(f)
@ -111,7 +106,7 @@ def sample_sharegpt_requests(
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: list[tuple[str, int, int]] = []
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
@ -135,35 +130,6 @@ def sample_sharegpt_requests(
return filtered_dataset
def sample_burstgpt_requests(
dataset_path: str,
num_requests: int,
random_seed: int,
tokenizer: PreTrainedTokenizerBase,
) -> list[tuple[str, int, int, None]]:
df = pd.read_csv(dataset_path)
gpt4_df = df[df["Model"] == "GPT-4"]
# Remove the failed requests (i.e., response length is 0)
gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
# Randomly sample num_requests from the dataset
if num_requests <= len(gpt4_df):
gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
else:
gpt4_df = gpt4_df.sample(n=num_requests,
random_state=random_seed,
replace=True)
# Convert the dataframe to a list of tuples
dataset = gpt4_df.values.tolist()
input_requests = []
for i in range(num_requests):
input_len = int(dataset[i][2])
output_len = int(dataset[i][3])
prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size
for j in range(input_len)])
input_requests.append((prompt, input_len, output_len, None))
return input_requests
def sample_sonnet_requests(
dataset_path: str,
num_requests: int,
@ -171,7 +137,7 @@ def sample_sonnet_requests(
output_len: int,
prefix_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> list[tuple[str, str, int, int, None]]:
) -> List[Tuple[str, str, int, int, None]]:
assert (
input_len > prefix_len
), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
@ -212,7 +178,7 @@ def sample_sonnet_requests(
prefix_lines = poem_lines[:num_prefix_lines]
# Sample the rest of lines per request.
sampled_requests: list[tuple[str, int, int]] = []
sampled_requests: List[Tuple[str, int, int]] = []
for _ in range(num_requests):
num_lines_needed = num_input_lines - num_prefix_lines
sampled_lines = "".join(prefix_lines +
@ -234,19 +200,25 @@ def sample_sonnet_requests(
return sampled_requests
def sample_vision_arena_requests(
def sample_mmmu_pro_vision_requests(
dataset,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]:
sampled_requests: list[tuple[str, int, int, dict[str,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in dataset:
if len(sampled_requests) == num_requests:
break
prompt = data["turns"][0][0]['content']
# MMMU-Pro vision direct prompt
# Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
prompt = (
"Answer with the option letter from the given choices directly. "
"The last line of your response should be of the following "
"format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
"options.")
prompt_token_ids = tokenizer(prompt).input_ids
if fixed_output_len is None:
@ -258,10 +230,10 @@ def sample_vision_arena_requests(
output_len = fixed_output_len
assert isinstance(
data["images"][0],
data["image"],
Image), ("Input image format must be `PIL.Image.Image`, "
f"given {type(data['image'])}.")
image: Image = data["images"][0]
image: Image = data["image"]
image = image.convert("RGB")
image_data = io.BytesIO()
image.save(image_data, format='JPEG')
@ -280,25 +252,27 @@ def sample_vision_arena_requests(
def sample_hf_requests(
dataset_path: str,
dataset_subset: Optional[str],
dataset_subset: str,
dataset_split: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
random_seed: int,
fixed_output_len: Optional[int] = None,
) -> list[tuple[str, str, int, Optional[dict[str, Collection[str]]]]]:
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
# Special case for vision_arena dataset
if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
and dataset_subset is None:
assert dataset_split == "train"
# Special case for MMMU-Pro vision dataset
if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
assert dataset_split == "test"
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
streaming=True)
dataset = dataset.shuffle(seed=random_seed)
return sample_vision_arena_requests(dataset, num_requests, tokenizer,
fixed_output_len)
assert "image" in dataset.features, (
"MMMU/MMMU_Pro vision dataset must have 'image' column.")
filter_func = lambda x: isinstance(x["image"], Image)
dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
return sample_mmmu_pro_vision_requests(dataset, num_requests,
tokenizer, fixed_output_len)
dataset = load_dataset(dataset_path,
name=dataset_subset,
@ -308,7 +282,7 @@ def sample_hf_requests(
"HF Dataset must have 'conversations' column.")
filter_func = lambda x: len(x["conversations"]) >= 2
filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
sampled_requests: list[tuple[str, int, int, dict[str,
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in filtered_dataset:
if len(sampled_requests) == num_requests:
@ -371,7 +345,7 @@ def sample_random_requests(
num_prompts: int,
range_ratio: float,
tokenizer: PreTrainedTokenizerBase,
) -> list[tuple[str, int, int]]:
) -> List[Tuple[str, int, int]]:
prefix_token_ids = np.random.randint(0,
tokenizer.vocab_size,
size=prefix_len).tolist()
@ -400,26 +374,26 @@ def sample_random_requests(
async def get_request(
input_requests: list[tuple[str, int, int]],
input_requests: List[Tuple[str, int, int]],
request_rate: float,
burstiness: float = 1.0,
) -> AsyncGenerator[tuple[str, int, int], None]:
) -> AsyncGenerator[Tuple[str, int, int], None]:
"""
Asynchronously generates requests at a specified rate
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
Args:
input_requests:
input_requests:
A list of input requests, each represented as a tuple.
request_rate:
request_rate:
The rate at which requests are generated (requests/s).
burstiness (optional):
The burstiness factor of the request generation.
burstiness (optional):
The burstiness factor of the request generation.
Only takes effect when request_rate is not inf.
Default value is 1, which follows a Poisson process.
Otherwise, the request intervals follow a gamma distribution.
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests.
"""
input_requests = iter(input_requests)
@ -444,23 +418,23 @@ async def get_request(
def calculate_metrics(
input_requests: list[tuple[str, int, int]],
outputs: list[RequestFuncOutput],
input_requests: List[Tuple[str, int, int]],
outputs: List[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: dict[str, float],
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
goodput_config_dict: Dict[str, float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
completed = 0
good_completed = 0
itls: list[float] = []
tpots: list[float] = []
all_tpots: list[float] = []
ttfts: list[float] = []
e2els: list[float] = []
itls: List[float] = []
tpots: List[float] = []
all_tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
output_len = outputs[i].output_tokens
@ -558,19 +532,18 @@ async def benchmark(
model_id: str,
model_name: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: list[tuple[str, int, int]],
input_requests: List[Tuple[str, int, int]],
logprobs: Optional[int],
best_of: int,
request_rate: float,
burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
ignore_eos: bool,
goodput_config_dict: dict[str, float],
goodput_config_dict: Dict[str, float],
max_concurrency: Optional[int],
lora_modules: Optional[list[str]],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -596,7 +569,6 @@ async def benchmark(
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
@ -605,11 +577,6 @@ async def benchmark(
else:
print("Initial test run completed. Starting main benchmark run...")
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules) for _ in range(len(input_requests))])
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
@ -653,16 +620,11 @@ async def benchmark(
pbar=pbar)
benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = []
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request
req_model_id, req_model_name = model_id, model_name
if lora_modules:
req_lora_module = next(lora_modules)
req_model_id, req_model_name = req_lora_module, req_lora_module
request_func_input = RequestFuncInput(model=req_model_id,
model_name=req_model_name,
request_func_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
@ -675,7 +637,7 @@ async def benchmark(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
@ -820,31 +782,6 @@ def parse_goodput(slo_pairs):
return goodput_config_dict
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any],
file_name: str) -> None:
metrics = [
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]]
for k in metrics},
extra_info={
k: results[k]
for k in results if k not in metrics and k not in ignored_metrics
})
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@ -867,10 +804,18 @@ def main(args: argparse.Namespace):
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code)
if args.dataset_name is None:
raise ValueError(
"Please specify '--dataset-name' and the corresponding "
"'--dataset-path' if required.")
if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next "
"release. Please use '--dataset-name' and "
"'--dataset-path' in the future runs.",
stacklevel=2)
input_requests = sample_sharegpt_requests(
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sharegpt":
input_requests = sample_sharegpt_requests(
@ -880,14 +825,6 @@ def main(args: argparse.Namespace):
fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "burstgpt":
input_requests = sample_burstgpt_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
random_seed=args.seed,
tokenizer=tokenizer,
)
elif args.dataset_name == "sonnet":
# Do not format the prompt, pass to message directly
if args.backend == "openai-chat":
@ -970,12 +907,11 @@ def main(args: argparse.Namespace):
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
))
# Save config and results to json
if args.save_result:
result_json: dict[str, Any] = {}
result_json: Dict[str, Any] = {}
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@ -998,8 +934,8 @@ def main(args: argparse.Namespace):
)
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@ -1017,7 +953,6 @@ def main(args: argparse.Namespace):
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
@ -1035,8 +970,7 @@ if __name__ == "__main__":
default=None,
help="Server or API base url if not using http host and port.",
)
# Use 127.0.0.1 here instead of localhost to force the use of ipv4
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--endpoint",
@ -1044,11 +978,18 @@ if __name__ == "__main__":
default="/v1/completions",
help="API endpoint.",
)
parser.add_argument(
"--dataset",
type=str,
default=None,
help="Path to the ShareGPT dataset, will be deprecated in the "
"next release.",
)
parser.add_argument(
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
choices=["sharegpt", "sonnet", "random", "hf"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
@ -1290,12 +1231,11 @@ if __name__ == "__main__":
'--tokenizer-mode',
type=str,
default="auto",
choices=['auto', 'slow', 'mistral', 'custom'],
choices=['auto', 'slow', 'mistral'],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
'"mistral" will always use the `mistral_common` tokenizer. \n*'
'"custom" will use --tokenizer to select the preregistered tokenizer.')
'"mistral" will always use the `mistral_common` tokenizer.')
parser.add_argument("--served-model-name",
type=str,
@ -1304,12 +1244,5 @@ if __name__ == "__main__":
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
r"""Benchmark online serving throughput with guided decoding.
On the server side, run one of the following commands:
@ -9,7 +8,7 @@ On the server side, run one of the following commands:
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving_guided.py \
python benchmarks/benchmark_serving.py \
--backend <backend> \
--model <your_model> \
--dataset json \
@ -30,9 +29,8 @@ import os
import random
import time
import warnings
from collections.abc import AsyncGenerator
from dataclasses import dataclass
from typing import Optional
from typing import AsyncGenerator, List, Optional, Tuple
import datasets
import numpy as np
@ -67,22 +65,22 @@ class BenchmarkMetrics:
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
percentiles_ttft_ms: list[tuple[float, float]]
percentiles_ttft_ms: List[Tuple[float, float]]
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
percentiles_tpot_ms: list[tuple[float, float]]
percentiles_tpot_ms: List[Tuple[float, float]]
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
percentiles_itl_ms: list[tuple[float, float]]
percentiles_itl_ms: List[Tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: list[tuple[float, float]]
percentiles_e2el_ms: List[Tuple[float, float]]
@dataclasses.dataclass
@ -105,7 +103,7 @@ class SampleRequest:
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> list[SampleRequest]:
args: argparse.Namespace) -> List[SampleRequest]:
if args.dataset == 'json':
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
@ -188,7 +186,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
]
elif args.dataset == "xgrammar_bench":
requests: list[SampleRequest] = []
requests: List[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
print(f"dataset has {len(dataset)} entries")
@ -215,10 +213,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
async def get_request(
input_requests: list[SampleRequest],
input_requests: List[SampleRequest],
request_rate: float,
burstiness: float = 1.0,
) -> AsyncGenerator[tuple[int, SampleRequest], None]:
) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
"""
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
@ -259,23 +257,22 @@ async def get_request(
def calculate_metrics(
input_requests: list[tuple[str, int, int]],
outputs: list[RequestFuncOutput],
input_requests: List[Tuple[str, int, int]],
outputs: List[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: Optional[dict[str, float]] = None,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
total_input = 0
completed = 0
good_completed = 0
itls: list[float] = []
tpots: list[float] = []
all_tpots: list[float] = []
ttfts: list[float] = []
e2els: list[float] = []
itls: List[float] = []
tpots: List[float] = []
all_tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
@ -289,10 +286,10 @@ def calculate_metrics(
total_input += input_requests[i].prompt_len
tpot = 0
if output_len > 1:
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
tpot = latency_minus_ttft / (output_len - 1)
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
1)
tpots.append(tpot)
outputs[i].tpot = tpot
outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
# Note: if output_len <= 1, we regard tpot as 0 for goodput
all_tpots.append(tpot)
itls += outputs[i].itl
@ -302,28 +299,6 @@ def calculate_metrics(
else:
actual_output_lens.append(0)
if goodput_config_dict:
valid_metrics = []
slo_values = []
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
slo_values.append(goodput_config_dict["ttft"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
slo_values.append(goodput_config_dict["tpot"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
slo_values.append(goodput_config_dict["e2el"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
if is_good_req:
good_completed += 1
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
@ -369,18 +344,17 @@ async def benchmark(
base_url: str,
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: list[SampleRequest],
input_requests: List[SampleRequest],
request_rate: float,
burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
ignore_eos: bool,
max_concurrency: Optional[int],
guided_decoding_ratio: float,
guided_decoding_backend: str,
goodput_config_dict: Optional[dict[str, float]] = None,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -460,8 +434,8 @@ async def benchmark(
pbar=pbar)
benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = []
expected: list[str] = []
tasks: List[asyncio.Task] = []
expected: List[str] = []
async for i, request in get_request(input_requests, request_rate,
burstiness):
extra_body = prepare_extra_body(
@ -480,7 +454,7 @@ async def benchmark(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
@ -508,7 +482,6 @@ async def benchmark(
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
goodput_config_dict=goodput_config_dict,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@ -520,9 +493,6 @@ async def benchmark(
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
if goodput_config_dict:
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@ -646,40 +616,6 @@ def evaluate(ret, args):
100) if len(not_none_scores) > 0 else None
def parse_goodput(slo_pairs):
goodput_config_dict = {}
try:
for slo_pair in slo_pairs:
slo_name, slo_val = slo_pair.split(":")
goodput_config_dict[slo_name] = float(slo_val)
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
"Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds.") from err
return goodput_config_dict
def check_goodput_args(args):
goodput_config_dict = {}
VALID_NAMES = ["ttft", "tpot", "e2el"]
if args.goodput:
goodput_config_dict = parse_goodput(args.goodput)
for slo_name, slo_val in goodput_config_dict.items():
if slo_name not in VALID_NAMES:
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
f"{str(VALID_NAMES)}. ")
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
"non-negative.")
return goodput_config_dict
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@ -724,8 +660,6 @@ def main(args: argparse.Namespace):
input_requests = sample_requests(tokenizer, args)
goodput_config_dict = check_goodput_args(args)
benchmark_result, ret = asyncio.run(
benchmark(
backend=backend,
@ -746,7 +680,6 @@ def main(args: argparse.Namespace):
max_concurrency=args.max_concurrency,
guided_decoding_ratio=args.guided_decoding_ratio,
guided_decoding_backend=args.guided_decoding_backend,
goodput_config_dict=goodput_config_dict,
))
# Save config and results to json
@ -797,8 +730,7 @@ if __name__ == "__main__":
default=None,
help="Server or API base url if not using http host and port.",
)
# Use 127.0.0.1 here instead of localhost to force the use of ipv4
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--endpoint",
@ -931,18 +863,6 @@ if __name__ == "__main__":
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
parser.add_argument(
"--goodput",
nargs="+",
required=False,
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is in "
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
"separated by spaces. Allowed request level metric names are "
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
parser.add_argument("--no-guided-decoding",
action='store_true',
default=False,

View File

@ -1,17 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
import os
import random
import time
from functools import cache
from typing import Any, Optional
from typing import Dict, List, Optional, Tuple
import torch
import uvloop
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from PIL import Image
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
@ -74,12 +71,12 @@ def lora_path_on_disk(lora_path: str) -> str:
return get_adapter_absolute_path(lora_path)
lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
def get_random_lora_request(
args: argparse.Namespace
) -> tuple[LoRARequest, Optional[AnyTokenizer]]:
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
global lora_tokenizer_cache
lora_id = random.randint(1, args.max_loras)
lora_request = LoRARequest(lora_name=str(lora_id),
@ -91,7 +88,7 @@ def get_random_lora_request(
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> list[SampleRequest]:
args: argparse.Namespace) -> List[SampleRequest]:
dataset_path: str = args.dataset
num_requests: int = args.num_prompts
@ -109,7 +106,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: list[SampleRequest] = []
filtered_dataset: List[SampleRequest] = []
for data in tqdm(dataset,
total=len(filtered_dataset),
desc="sampling requests"):
@ -165,21 +162,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
def run_vllm(
requests: list[SampleRequest],
requests: List[SampleRequest],
n: int,
engine_args: EngineArgs,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
# Add the requests to the engine.
prompts: list[TextPrompt] = []
sampling_params: list[SamplingParams] = []
prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
for request in requests:
prompts.append(
TextPrompt(prompt=request.prompt,
@ -192,7 +184,7 @@ def run_vllm(
ignore_eos=True,
max_tokens=request.expected_output_len,
))
lora_requests: Optional[list[LoRARequest]] = None
lora_requests: Optional[List[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
@ -225,7 +217,7 @@ def run_vllm(
async def run_vllm_async(
requests: list[SampleRequest],
requests: List[SampleRequest],
n: int,
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
@ -234,17 +226,11 @@ async def run_vllm_async(
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
assert all(
llm.model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
# Add the requests to the engine.
prompts: list[TextPrompt] = []
sampling_params: list[SamplingParams] = []
lora_requests: list[Optional[LoRARequest]] = []
prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
lora_requests: List[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TextPrompt(prompt=request.prompt,
@ -276,7 +262,7 @@ async def run_vllm_async(
def run_hf(
requests: list[SampleRequest],
requests: List[SampleRequest],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
@ -292,7 +278,7 @@ def run_hf(
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: list[str] = []
batch: List[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
@ -334,7 +320,7 @@ def run_hf(
def run_mii(
requests: list[SampleRequest],
requests: List[SampleRequest],
model: str,
tensor_parallel_size: int,
output_len: int,
@ -351,24 +337,6 @@ def run_mii(
return end - start
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={
"requests_per_second": [results["requests_per_second"]],
"tokens_per_second": [results["tokens_per_second"]],
},
extra_info={
k: results[k]
for k in ["elapsed_time", "num_requests", "total_num_tokens"]
})
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@ -466,7 +434,6 @@ def main(args: argparse.Namespace):
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
if __name__ == "__main__":
@ -479,8 +446,8 @@ if __name__ == "__main__":
type=str,
default=None,
help="Path to the dataset. The dataset is expected to "
"be a json in form of list[dict[..., conversations: "
"list[dict[..., value: <prompt_or_response>]]]]")
"be a json in form of List[Dict[..., conversations: "
"List[Dict[..., value: <prompt_or_response>]]]]")
parser.add_argument("--input-len",
type=int,
default=None,

View File

@ -1,69 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)

View File

@ -1,12 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable
from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
@ -229,7 +226,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@ -242,7 +239,7 @@ def run(dtype: torch.dtype,
# output makers
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[tuple[int, int, int]],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
@ -283,7 +280,7 @@ def run_model_bench(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Cutlass bench utils
from collections.abc import Iterable
from typing import Iterable, Tuple
import torch
@ -27,7 +25,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> tuple[torch.Tensor, torch.Tensor]:
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
@ -63,7 +61,7 @@ def prune_to_2_4(tensor):
def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> tuple[torch.Tensor, torch.Tensor]:
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
@ -88,7 +86,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
m: int, n: int, k: int) -> \
tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = []
for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)

View File

@ -1,12 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable, Optional
from typing import Callable, Iterable, List, Tuple
import torch
import torch.utils.benchmark as TBenchmark
@ -15,8 +12,6 @@ from utils import make_rand_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_block_fp8_matmul)
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -43,15 +38,8 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
).blocked_autorange(min_run_time=min_run_time)
def bench_int8(
dtype: torch.dtype,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels."""
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.int8
a, b = make_rand_tensors(torch.int8, m, n, k)
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
@ -60,132 +48,155 @@ def bench_int8(
azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)
bench_fns = {
"pytorch_bf16_bf16_bf16_matmul-no-scales":
lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
),
"pytorch_fp16_fp16_fp16_matmul-no-scales":
lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
"cutlass_i8_i8_bf16_scaled_mm":
lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
"cutlass_i8_i8_bf16_scaled_mm_bias":
lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
bias),
"cutlass_i8_i8_bf16_scaled_mm_azp":
lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
bfloat16, azp_adj),
"cutlass_i8_i8_bf16_scaled_mm_azp_bias":
lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
bfloat16, azp_adj, None, bias),
"cutlass_i8_i8_bf16_scaled_mm_azp_pt":
lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
bfloat16, azp_adj, azp),
"cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias":
lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
bfloat16, azp_adj, azp, bias),
}
timers = []
for name, fn in bench_fns.items():
# If bench_kernels is None, run all. Otherwise, run only exact matches.
if bench_kernels is None or name in bench_kernels:
print(f"Running {name}")
timers.append(bench_fn(label, sub_label, name, fn))
# pytorch impl - bfloat16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16),
b.to(dtype=torch.bfloat16)))
# pytorch impl - float16
timers.append(
bench_fn(label, sub_label,
"pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
# cutlass impl
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass with bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass with azp per-tensor
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj))
# cutlass with azp per-tensor + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, None, bias))
# cutlass with azp per-token
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp))
# cutlass with azp per-token + bias
timers.append(
bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
torch.bfloat16, azp_adj, azp, bias))
return timers
def bench_fp8(
dtype: torch.dtype,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels."""
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
assert dtype == torch.float8_e4m3fn
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
a_cont = a.contiguous()
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
block_scale_a = torch.rand((m, k // 128),
device="cuda",
dtype=torch.float32)
block_scale_b = torch.rand((k // 128, n // 128),
device="cuda",
dtype=torch.float32)
block_scale_a_M_major = block_scale_a.t().contiguous().t()
block_scale_b_K_major = block_scale_b.t().contiguous().t()
bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)
print(m, k, n)
bench_fns = {
"pytorch_bf16_bf16_bf16_matmul-no-scales":
lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
),
"pytorch_fp16_fp16_fp16_matmul-no-scales":
lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
"pytorch_fp8_fp8_fp16_scaled_mm":
lambda: torch._scaled_mm(
a, b, scale_a, scale_b, out_dtype=torch.float16),
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum":
lambda: torch._scaled_mm(a,
b,
scale_a,
scale_b,
out_dtype=torch.float16,
use_fast_accum=True),
"pytorch_fp8_fp8_bf16_scaled_mm":
lambda: torch._scaled_mm(
a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum":
lambda: torch._scaled_mm(a,
b,
scale_a,
scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True),
"cutlass_fp8_fp8_bf16_scaled_mm":
lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
"cutlass_fp8_fp8_fp16_scaled_mm":
lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16),
"cutlass_fp8_fp8_bf16_scaled_mm_bias":
lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
bias),
"cutlass_fp8_fp8_fp16_scaled_mm_bias":
lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16,
bias.to(dtype=torch.float16)),
"triton_fp8_fp8_fp16_scaled_mm_blockwise":
lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a,
block_scale_b.t(), (128, 128)),
"cutlass_fp8_fp8_fp16_scaled_mm_blockwise":
lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major,
block_scale_b_K_major, torch.float16),
}
timers = []
for name, fn in bench_fns.items():
# If bench_kernels is None, run all. Otherwise, run only exact matches.
if bench_kernels is None or name in bench_kernels:
print(f"Running {name}")
timers.append(bench_fn(label, sub_label, name, fn))
# pytorch impl w. bf16
timers.append(
bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda")))
# pytorch impl: bf16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16))
# pytorch impl: bf16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.bfloat16,
use_fast_accum=True))
# pytorch impl: fp16 output, without fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16))
# pytorch impl: fp16 output, with fp8 fast accum
timers.append(
bench_fn(label,
sub_label,
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
torch._scaled_mm,
a,
b,
scale_a=scale_a,
scale_b=scale_b,
out_dtype=torch.float16,
use_fast_accum=True))
# cutlass impl: bf16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
torch.bfloat16))
# cutlass impl: fp16 output
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
# cutlass impl: bf16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
bias))
# cutlass impl: fp16 output, with bias
timers.append(
bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
bias.to(dtype=torch.float16)))
return timers
def bench(dtype: torch.dtype,
m: int,
k: int,
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
sub_label: str) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
return bench_int8(dtype, m, k, n, label, sub_label)
if dtype == torch.float8_e4m3fn:
return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels)
return bench_fp8(dtype, m, k, n, label, sub_label)
raise ValueError("unsupported type")
@ -196,24 +207,20 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]],
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype,
m,
k,
n,
f"scaled-{dtype}-gemm",
f"MKN=({m}x{k}x{n})",
bench_kernels=bench_kernels)
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
f"MKN=({m}x{k}x{n})")
print_timers(timers)
results.extend(timers)
return results
# output makers
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[tuple[int, int, int]],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
@ -225,11 +232,15 @@ def make_output(data: Iterable[TMeasurement],
pkl.dump(data, f)
# argparse runners
def run_square_bench(args):
dim_sizes = list(
range(args.dim_start, args.dim_end + 1, args.dim_increment))
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
data = run(args.dtype, MKNs, bench_kernels=args.kernels)
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"square_bench-{args.dtype}")
@ -240,7 +251,8 @@ def run_range_bench(args):
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
MKNs = list(zip(Ms, Ks, Ns))
data = run(args.dtype, MKNs, bench_kernels=args.kernels)
data = run(args.dtype, MKNs)
make_output(data, MKNs, f"range_bench-{args.dtype}")
@ -249,7 +261,7 @@ def run_model_bench(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
@ -266,7 +278,7 @@ def run_model_bench(args):
for k, n in KNs:
MKNs.append((m, k, n))
data = run(args.dtype, MKNs, bench_kernels=args.kernels)
data = run(args.dtype, MKNs)
model_bench_data.append(data)
# Print all results
@ -316,15 +328,6 @@ Benchmark Cutlass GEMM.
type=to_torch_dtype,
required=True,
help="Available options are ['int8', 'fp8']")
parser.add_argument(
"--kernels",
nargs="+",
type=str,
default=None,
help=
"Exact names of the kernels to benchmark. If not set, runs all kernels."
)
subparsers = parser.add_subparsers(dest="cmd")
square_parser = subparsers.add_parser("square_bench")
@ -359,4 +362,4 @@ Benchmark Cutlass GEMM.
model_parser.set_defaults(func=run_model_bench)
args = parser.parse_args()
args.func(args)
args.func(args)

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import os
import aiohttp

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
import itertools

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import json
import matplotlib.pyplot as plt

View File

@ -1,11 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
import pickle as pkl
import time
from collections.abc import Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
from typing import Callable, Iterable, List, Optional
import torch
import torch.utils.benchmark as TBenchmark
@ -30,7 +27,7 @@ class bench_params_t:
f'x DT {self.dtype}')
def get_bench_params() -> list[bench_params_t]:
def get_bench_params() -> List[bench_params_t]:
## Test Fixtures
NUM_TOKENS = [2**x for x in range(11)]
HIDDEN_SIZES = list(range(1024, 8129, 1024))

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import os
import sys
from typing import Optional

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import time
import torch

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import json
@ -9,7 +7,7 @@ from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
from typing import Any, Callable, Optional
from typing import Any, Callable, Dict, List, Optional, Tuple
import torch
import torch.utils.benchmark as TBenchmark
@ -61,15 +59,15 @@ def make_rand_lora_weight_tensor(k: int,
def make_rand_tensors(
a_shape: tuple[int],
b_shape: tuple[int],
c_shape: tuple[int],
a_shape: Tuple[int],
b_shape: Tuple[int],
c_shape: Tuple[int],
a_dtype: torch.dtype,
b_dtype: torch.dtype,
c_dtype: torch.dtype,
num_slices: int,
device: str = "cuda",
) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]:
) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
"""
Make LoRA input/output matrices.
"""
@ -89,7 +87,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
sort_by_lora_id: bool,
device: str) -> torch.Tensor:
"""
All prompts are mapped to a LoRA ID in range [0, num_active_loras).
All prompts are mapped to a Lora ID in range [0, num_active_loras).
where 0 refers to first lora, 1 refers to second lora and so on.
"""
assert num_active_loras > 0
@ -135,7 +133,7 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int,
def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
lora_weights: list[torch.Tensor],
lora_weights: List[torch.Tensor],
seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor, scaling: float,
add_inputs: Optional[bool]):
@ -204,7 +202,7 @@ class OpType(Enum):
def is_expand_slice_fn(self) -> bool:
return self in [OpType.BGMV_EXPAND_SLICE]
def num_slices(self) -> list[int]:
def num_slices(self) -> List[int]:
if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]:
# SGMV kernels supports slices
return [1, 2, 3]
@ -215,7 +213,7 @@ class OpType(Enum):
raise ValueError(f"Unrecognized OpType {self}")
def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int) -> tuple[int, int, int]:
lora_rank: int) -> Tuple[int, int, int]:
num_tokens = batch_size * seq_length
if self.is_shrink_fn():
m = num_tokens
@ -230,7 +228,7 @@ class OpType(Enum):
def matmul_dtypes(
self, op_dtype: torch.dtype
) -> tuple[torch.dtype, torch.dtype, torch.dtype]:
) -> Tuple[torch.dtype, torch.dtype, torch.dtype]:
"""
return a type, b type and c type for A x B = C
"""
@ -243,7 +241,7 @@ class OpType(Enum):
def matmul_shapes(
self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int, num_loras: int,
num_slices: int) -> tuple[tuple[int], tuple[int], tuple[int]]:
num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]:
"""
Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type
@ -268,7 +266,7 @@ class OpType(Enum):
def bench_fn(self) -> Callable:
def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]):
for x in kwargs_list:
bgmv_expand_slice(**x)
@ -285,7 +283,7 @@ class OpType(Enum):
raise ValueError(f"Unrecognized optype {self}")
def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor,
lora_weights: list[torch.Tensor],
lora_weights: List[torch.Tensor],
**kwargs) -> Callable:
"""Each benchmark operation expected the input, lora_weights and outputs
in a slightly different format. Refer to self.matmul_shapes().
@ -384,7 +382,7 @@ class BenchmarkTensors:
"""
# matmul tensors
input: torch.Tensor
lora_weights_lst: list[torch.Tensor]
lora_weights_lst: List[torch.Tensor]
output: torch.Tensor
# metadata tensors
seq_lens: torch.Tensor
@ -469,7 +467,7 @@ class BenchmarkTensors:
for i in range(len(self.lora_weights_lst)):
self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
def metadata(self) -> tuple[int, int, int]:
def metadata(self) -> Tuple[int, int, int]:
"""
Return num_seqs, num_tokens and max_seq_len
"""
@ -505,7 +503,7 @@ class BenchmarkTensors:
self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
@ -540,7 +538,7 @@ class BenchmarkTensors:
'scaling': 1.0,
}
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
@ -578,7 +576,7 @@ class BenchmarkTensors:
'add_inputs': add_inputs,
}
def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]:
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
@ -634,7 +632,7 @@ class BenchmarkTensors:
'add_inputs': add_inputs
}
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
_, num_tokens, _, num_slices = self.metadata()
# Sanity check shapes
@ -670,7 +668,7 @@ class BenchmarkTensors:
def bench_fn_kwargs(self,
op_type: OpType,
add_inputs: Optional[bool] = None) -> dict[str, Any]:
add_inputs: Optional[bool] = None) -> Dict[str, Any]:
if op_type.is_shrink_fn():
assert add_inputs is None
else:
@ -734,7 +732,7 @@ def bench_optype(ctx: BenchmarkContext,
assert expand_fn_add_inputs is not None
# BenchmarkContext -> BenchmarkTensors
bench_tensors : list[BenchmarkTensors] = \
bench_tensors : List[BenchmarkTensors] = \
[BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)]
for bt in bench_tensors:
bt.sanity_check()
@ -746,7 +744,7 @@ def bench_optype(ctx: BenchmarkContext,
for bt in bench_tensors
])
# BenchmarkTensors -> dict (kwargs)
# BenchmarkTensors -> Dict (kwargs)
kwargs_list = [
bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
for bt in bench_tensors
@ -841,7 +839,7 @@ def use_cuda_graph_recommendation() -> str:
"""
def print_timers(timers: list[TMeasurement],
def print_timers(timers: List[TMeasurement],
args: Optional[argparse.Namespace] = None):
compare = TBenchmark.Compare(timers)
compare.print()
@ -861,7 +859,7 @@ def print_timers(timers: list[TMeasurement],
"small num_loras the goal should be to match the torch.mm numbers.")
def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
if args.cuda_graph_nops is not None:
assert args.cuda_graph_nops > 0
@ -873,7 +871,7 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
timers = []
for bench_ctx in bench_ctxs:
for seq_len in args.seq_lengths:
bench_ops: list[OpType] = []
bench_ops: List[OpType] = []
if seq_len == 1:
# bench all decode ops
bench_ops = [op for op in args.op_types if op.is_decode_op()]
@ -921,10 +919,10 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
pickle.dump(timers, f)
def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int],
args: argparse.Namespace) -> list[BenchmarkContext]:
def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int],
args: argparse.Namespace) -> List[BenchmarkContext]:
ctxs: list[BenchmarkContext] = []
ctxs: List[BenchmarkContext] = []
for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa
args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras,
args.sort_by_lora_id):
@ -954,7 +952,7 @@ def run_list_bench(args: argparse.Namespace):
f" LoRA Ranks {args.lora_ranks}")
# Get all benchmarking contexts
bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args)
run(args, bench_contexts)
@ -975,7 +973,7 @@ def run_range_bench(args: argparse.Namespace):
f" LoRA Ranks {lora_ranks}")
# Get all benchmarking contexts
bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args)
run(args, bench_contexts)
@ -1002,7 +1000,7 @@ def run_model_bench(args: argparse.Namespace):
f" LoRA Ranks {args.lora_ranks}")
# Get all benchmarking contexts
bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args)
run(args, bench_contexts)

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools
@ -7,10 +5,9 @@ import math
import os
import pickle as pkl
import time
from collections.abc import Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional
from typing import Callable, Iterable, List, Optional, Tuple
import pandas as pd
import torch
@ -103,8 +100,8 @@ def quantize_and_pack(atype: torch.dtype,
return w_ref, w_q, w_s, w_zp
def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig,
group_size: Optional[int]) -> list[BenchmarkTensors]:
def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
group_size: Optional[int]) -> List[BenchmarkTensors]:
m, n, k = shape
# we want to make sure that weights don't fit into L2 cache between runs so
@ -115,7 +112,7 @@ def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig,
a = rand_data((m, k), types.act_type, scale=5)
benchmark_tensors: list[BenchmarkTensors] = []
benchmark_tensors: List[BenchmarkTensors] = []
for _ in range(num_weights):
w = rand_data((k, n), types.act_type, scale=5)
@ -277,7 +274,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
def bench_fns(label: str, sub_label: str, description: str,
fns: list[Callable]):
fns: List[Callable]):
min_run_time = 1 if not NVTX_PROFILE else 0.1
res = TBenchmark.Timer(
@ -312,7 +309,7 @@ def bench(types: TypeConfig,
n: int,
label: str,
sub_label: str,
sweep_schedules: bool = True) -> list[TMeasurement]:
sweep_schedules: bool = True) -> List[TMeasurement]:
benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
sub_label += f", L={len(benchmark_tensors)}"
@ -415,12 +412,12 @@ def bench(types: TypeConfig,
# runner
def print_timers(timers: list[TMeasurement]):
def print_timers(timers: List[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
types = TypeConfig(
act_type=args.act_type,
weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
@ -432,7 +429,7 @@ def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
token_scale_type=args.token_scale_type,
)
results: list[TMeasurement] = []
results: List[TMeasurement] = []
for m, k, n in MKNs:
timers = bench(types,
args.group_size,
@ -450,8 +447,8 @@ def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
# output makers
def make_output(
data: list[TMeasurement],
MKNs: Iterable[tuple[int, int, int]],
data: List[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
base_description: str,
timestamp=None,
):
@ -498,7 +495,7 @@ def run_model_bench(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size

View File

@ -1,4 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
from typing import List
import torch
import torch.utils.benchmark as benchmark
@ -8,8 +8,6 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
from vllm.model_executor.layers.quantization.utils.allspark_utils import (
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, ALLSPARK_SUPPORTED_QUANT_TYPES)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
@ -18,18 +16,18 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
gptq_pack, gptq_quantize_weights, sort_weights)
from vllm.scalar_type import ScalarType
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS = [False, True]
def bench_run(results: list[benchmark.Measurement], model: str,
def bench_run(results: List[benchmark.Measurement], model: str,
act_order: bool, is_k_full: bool, quant_type: ScalarType,
group_size: int, size_m: int, size_k: int, size_n: int):
label = "Quant Matmul"
@ -81,27 +79,6 @@ def bench_run(results: list[benchmark.Measurement], model: str,
GPTQ_MARLIN_24_MAX_PARALLEL)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
# AllSpark W8A16 quant
as_supported_case = (quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1 and not act_order and is_k_full)
if as_supported_case:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = (sm_version >= 80 and sm_version < 90)
as_supported_case = as_supported_case and supported_arch
if supported_arch:
has_zp = False
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size,
has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = \
ops.allspark_repack_weight(
qw, s, zp, has_zp)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
globals = {
# Gen params
"quant_type": quant_type,
@ -130,19 +107,10 @@ def bench_run(results: list[benchmark.Measurement], model: str,
# GPTQ params
"q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices,
# AllSpark W8A16 params
"qw_reorder": qw_reorder if as_supported_case else None,
"s_reorder": s_reorder if as_supported_case else None,
"zp_reorder": zp_reorder if as_supported_case else None,
"sm_count": sm_count if as_supported_case else None,
"sm_version": sm_version if as_supported_case else None,
"CUBLAS_M_THRESHOLD":
CUBLAS_M_THRESHOLD if as_supported_case else None,
# Kernels
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
"gptq_marlin_repack": ops.gptq_marlin_repack,
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
}
min_run_time = 1
@ -202,24 +170,13 @@ def bench_run(results: list[benchmark.Measurement], model: str,
description="gptq_marlin_repack",
).blocked_autorange(min_run_time=min_run_time))
if as_supported_case:
results.append(
benchmark.Timer(
stmt=
"output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="allspark_w8a16_gemm_fp32",
).blocked_autorange(min_run_time=min_run_time))
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: list[benchmark.Measurement] = []
results: List[benchmark.Measurement] = []
for model in args.models:
for layer in WEIGHT_SHAPES[model]:

View File

@ -1,11 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import time
from contextlib import nullcontext
from datetime import datetime
from itertools import product
from typing import Any, TypedDict
from typing import Any, Dict, List, Tuple, TypedDict
import ray
import torch
@ -41,7 +38,6 @@ def benchmark_config(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
@ -83,24 +79,8 @@ def benchmark_config(
dtype=torch.float32)
w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
if use_fp8_w8a8:
if block_quant_shape:
block_n, block_k = block_quant_shape[0], block_quant_shape[1]
E = num_experts
N = shard_intermediate_size // 2
K = hidden_size
factor_for_scale = 1e-2
n_tiles_w1 = (2 * N + block_n - 1) // block_n
n_tiles_w2 = (K + block_n - 1) // block_n
k_tiles_w1 = (K + block_k - 1) // block_k
k_tiles_w2 = (N + block_k - 1) // block_k
w1_scale = torch.rand((E, n_tiles_w1, k_tiles_w1),
dtype=torch.float32) * factor_for_scale
w2_scale = torch.rand((E, n_tiles_w2, k_tiles_w2),
dtype=torch.float32) * factor_for_scale
else:
w1_scale = torch.randn(num_experts, dtype=torch.float32)
w2_scale = torch.randn(num_experts, dtype=torch.float32)
w1_scale = torch.randn(num_experts, dtype=torch.float32)
w2_scale = torch.randn(num_experts, dtype=torch.float32)
a1_scale = torch.randn(1, dtype=torch.float32)
a2_scale = torch.randn(1, dtype=torch.float32)
@ -129,7 +109,6 @@ def benchmark_config(
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
# JIT compilation & warmup
@ -151,7 +130,7 @@ def benchmark_config(
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = []
latencies: List[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
@ -194,9 +173,8 @@ def get_rocm_tuning_space(use_fp16):
return param_ranges
def get_configs_compute_bound(use_fp16,
block_quant_shape) -> list[dict[str, int]]:
configs: list[BenchmarkConfig] = []
def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
configs: List[BenchmarkConfig] = []
if current_platform.is_rocm():
param_ranges = get_rocm_tuning_space(use_fp16)
@ -224,27 +202,17 @@ def get_configs_compute_bound(use_fp16,
for config_values in product(*values):
config = dict(zip(keys, config_values))
configs.append(config)
# Remove configs that are not compatible with fp8 block quantization
# BLOCK_SIZE_K must be a multiple of block_k
# BLOCK_SIZE_N must be a multiple of block_n
if block_quant_shape is not None and not use_fp16:
block_n, block_k = block_quant_shape[0], block_quant_shape[1]
for config in configs[:]:
if config["BLOCK_SIZE_K"] % block_k != 0 or config[
"BLOCK_SIZE_N"] % block_n != 0:
configs.remove(config)
return configs
def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
search_space, is_fp16, topk):
search_space, is_fp16):
N1, K1 = shard_intermediate_size, hidden_size
N2, K2 = hidden_size, shard_intermediate_size // 2
pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1,
search_space, is_fp16)
pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2,
search_space, is_fp16)
pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
is_fp16)
pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
is_fp16)
search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
return search_space
@ -365,7 +333,7 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
) -> tuple[dict[str, int], float]:
) -> Tuple[Dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
@ -375,13 +343,9 @@ class BenchmarkWorker:
op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
dtype_str)
if op_config is None:
config = get_default_config(num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype_str,
is_marlin=False)
config = get_default_config(num_tokens, num_experts,
shard_intermediate_size, hidden_size,
topk, dtype_str)
else:
config = op_config[min(op_config.keys(),
key=lambda x: abs(x - num_tokens))]
@ -401,9 +365,8 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
search_space: list[dict[str, int]],
block_quant_shape: list[int],
) -> dict[str, int]:
search_space: List[Dict[str, int]],
) -> Dict[str, int]:
best_config = None
best_time = float("inf")
if current_platform.is_rocm():
@ -411,24 +374,21 @@ class BenchmarkWorker:
search_space = prune_rocm_search_space(num_tokens,
shard_intermediate_size,
hidden_size, search_space,
is_fp16, topk)
is_fp16)
with torch.cuda.device(self.device_id) if current_platform.is_rocm(
) else nullcontext():
with torch.cuda.device(self.device_id):
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(
config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20,
block_quant_shape=block_quant_shape)
kernel_time = benchmark_config(config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
@ -468,10 +428,10 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
}
def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int,
def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
shard_intermediate_size: int, hidden_size: int, topk: int,
dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool,
block_quant_shape: List[int]) -> None:
dtype: torch.dtype, use_fp8_w8a8: bool,
use_int8_w8a16: bool) -> None:
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
@ -479,7 +439,7 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int,
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
dtype_str, block_quant_shape)
dtype_str)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
@ -489,9 +449,8 @@ def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int,
def main(args: argparse.Namespace):
print(args)
block_quant_shape = None
config = AutoConfig.from_pretrained(
args.model, trust_remote_code=args.trust_remote_code)
config = AutoConfig.from_pretrained(args.model)
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
@ -502,13 +461,6 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif (config.architectures[0] == "DeepseekV3ForCausalLM"
or config.architectures[0] == "DeepseekV2ForCausalLM"):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
block_quant_shape = config.quantization_config['weight_block_size']
else:
# Default: Mixtral.
E = config.num_local_experts
@ -533,7 +485,7 @@ def main(args: argparse.Namespace):
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
def _distribute(method: str, inputs: list[Any]) -> list[Any]:
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
outputs = []
worker_idx = 0
for input_args in inputs:
@ -546,30 +498,27 @@ def main(args: argparse.Namespace):
if args.tune:
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
search_space = get_configs_compute_bound(is_fp16)
print(f"Start tuning over {len(search_space)} configurations...")
start = time.time()
configs = _distribute(
"tune",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
for batch_size in batch_sizes])
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
}
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16,
block_quant_shape)
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
end = time.time()
print(f"Tuning took {end - start:.2f} seconds")
else:
outputs = _distribute(
"benchmark",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
for batch_size in batch_sizes])
"benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
print(f"Batch size: {batch_size}, config: {config}")
@ -581,11 +530,7 @@ if __name__ == "__main__":
parser.add_argument("--model",
type=str,
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
parser.add_argument("--tp-size",
"-tp",
"--tensor-parallel-size",
type=int,
default=2)
parser.add_argument("--tp-size", "-tp", type=int, default=2)
parser.add_argument("--dtype",
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"],
@ -593,7 +538,6 @@ if __name__ == "__main__":
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument("--trust-remote-code", action="store_true")
args = parser.parse_args()
main(args)

View File

@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
import random
import time
from typing import Optional
from typing import List, Optional
import torch
@ -11,9 +9,8 @@ from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random)
NUM_BLOCKS = 128 * 1024
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
PARTITION_SIZE_ROCM = 256
@torch.inference_mode()
@ -55,7 +52,7 @@ def main(
# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables_lst: list[list[int]] = []
block_tables_lst: List[List[int]] = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
@ -81,12 +78,6 @@ def main(
# Prepare for the paged attention kernel.
output = torch.empty_like(query)
if version == "v2":
if current_platform.is_rocm():
global PARTITION_SIZE
if not args.custom_paged_attn:
PARTITION_SIZE = 1024
else:
PARTITION_SIZE = PARTITION_SIZE_ROCM
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size),
@ -130,46 +121,25 @@ def main(
v_scale,
)
elif version == "v2":
if not args.custom_paged_attn:
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
else:
ops.paged_attention_rocm(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
else:
raise ValueError(f"Invalid version: {version}")
torch.cuda.synchronize()
@ -223,9 +193,6 @@ if __name__ == '__main__':
help="Data type for kv cache storage. If 'auto', will use model "
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
parser.add_argument("--custom-paged-attn",
action="store_true",
help="Use custom paged attention")
args = parser.parse_args()
print(args)

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import time
import torch

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import itertools
from typing import Optional, Union
from typing import Optional, Tuple, Union
import torch
import triton
@ -22,7 +20,7 @@ class HuggingFaceRMSNorm(nn.Module):
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from itertools import accumulate
from typing import Optional
from typing import List, Optional
import nvtx
import torch
@ -39,7 +37,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes: list[RotaryEmbedding] = []
non_batched_ropes: List[RotaryEmbedding] = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]],
"mistralai/Mistral-7B-v0.1/TP1": [

View File

@ -1,9 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
import math
import pickle
import re
from collections import defaultdict
from typing import List
import matplotlib.pyplot as plt
import pandas as pd
@ -22,7 +21,7 @@ if __name__ == "__main__":
with open(args.filename, 'rb') as f:
data = pickle.load(f)
raw_results: list[TMeasurement] = data["results"]
raw_results: List[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
for v in raw_results:

View File

@ -1,8 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import dataclasses
from collections.abc import Iterable
from typing import Any, Callable, Optional
from typing import Any, Callable, Iterable, Optional
import torch
import torch.utils.benchmark as TBenchmark

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
import cProfile
import pstats

View File

@ -1,66 +0,0 @@
include(FetchContent)
# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory
# instead of downloading.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{FLASH_MLA_SRC_DIR})
set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR})
endif()
if(FLASH_MLA_SRC_DIR)
FetchContent_Declare(
flashmla
SOURCE_DIR ${FLASH_MLA_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_MakeAvailable(flashmla)
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/include)
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
define_gpu_extension_target(
_flashmla_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_SOURCES}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3
WITH_SOABI)
else()
# Create an empty target for setup.py when not targeting sm90a systems
add_custom_target(_flashmla_C)
endif()

View File

@ -1,67 +0,0 @@
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py"
)

View File

@ -1,5 +1,4 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
#
# A command line tool for running pytorch's hipify preprocessor on CUDA

View File

@ -257,9 +257,9 @@ endmacro()
# where `<=` is the version comparison operator.
# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
# 9.0a to the result.
# The result is stored in `OUT_CUDA_ARCHS`.
#
# Example:
@ -270,55 +270,34 @@ endmacro()
#
function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
# if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
# remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
set(_CUDA_ARCHS)
if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
if ("9.0" IN_LIST TGT_CUDA_ARCHS_)
list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0")
if ("9.0" IN_LIST TGT_CUDA_ARCHS)
set(_CUDA_ARCHS "9.0a")
endif()
endif()
if ("10.0a" IN_LIST SRC_CUDA_ARCHS)
list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a")
if ("10.0" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0")
set(_CUDA_ARCHS "10.0a")
endif()
endif()
list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
# for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
# is less or equal to ARCH (but has the same major version since SASS binary
# compatibility is only forward compatible within the same major version).
foreach(_ARCH ${TGT_CUDA_ARCHS_})
set(_TMP_ARCH)
# Extract the major version of the target arch
string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
# Extract the major version of the source arch
string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
# Check major-version match AND version-less-or-equal
if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
set(_TMP_ARCH "${_SRC_ARCH}")
endif()
else()
# If we hit a version greater than the target, we can break
break()
endif()
endforeach()
# If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
if (_TMP_ARCH)
list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
# for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is
# less or eqault to ARCH
foreach(_ARCH ${CUDA_ARCHS})
set(_TMP_ARCH)
foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
set(_TMP_ARCH ${_SRC_ARCH})
else()
break()
endif()
endforeach()
if (_TMP_ARCH)
list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
endif()
endforeach()
list(REMOVE_DUPLICATES _CUDA_ARCHS)
set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)

View File

@ -1,5 +1,3 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

View File

@ -15,9 +15,6 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping);
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
@ -31,18 +28,6 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
const std::string& kv_cache_dtype,
torch::Tensor& k_scale, torch::Tensor& v_scale);
void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
torch::Tensor& kv_cache, torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype,
torch::Tensor& scale);
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
void gather_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);

View File

@ -2,7 +2,6 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_utils.h"
#include "cuda_compat.h"
#include "dispatch_utils.h"
@ -47,10 +46,7 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
char* src_ptr = static_cast<char*>(src.data_ptr());
char* dst_ptr = static_cast<char*>(dst.data_ptr());
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
const at::cuda::OptionalCUDAGuard device_guard(
src_device.is_cuda() ? src_device : dst_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -97,24 +93,6 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
}
}
// Kernel for MLA, which works on a single joint kv_cache
// Grid: (num_layers, num_pairs)
template <typename scalar_t>
__global__ void copy_blocks_mla_kernel(
int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping,
const int mem_footprint_per_block) {
const int layer_idx = blockIdx.x;
const int pair_idx = blockIdx.y;
scalar_t* cache = reinterpret_cast<scalar_t*>(cache_ptrs[layer_idx]);
int64_t src_block = block_mapping[2 * pair_idx];
int64_t dst_block = block_mapping[2 * pair_idx + 1];
int64_t src_offset = src_block * mem_footprint_per_block;
int64_t dst_offset = dst_block * mem_footprint_per_block;
for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) {
cache[dst_offset + i] = cache[src_offset + i];
}
}
} // namespace vllm
// Note: the key_caches and value_caches vectors are constant but
@ -169,42 +147,6 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
}));
}
// copy blocks kernel for MLA (assumes a joint KV-cache)
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping) {
int num_layers = kv_caches.size();
if (num_layers == 0) {
return;
}
torch::Device cache_device = kv_caches[0].device();
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
std::vector<int64_t> cache_ptrs(num_layers);
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
}
torch::Tensor cache_ptrs_tensor =
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
.to(cache_device);
int num_pairs = block_mapping.size(0);
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
int mem_footprint_per_block = kv_caches[0].stride(0);
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, mem_footprint_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
}));
}
namespace vllm {
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@ -303,51 +245,6 @@ __global__ void reshape_and_cache_flash_kernel(
}
}
}
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void concat_and_cache_mla_kernel(
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size, //
const float* scale //
) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
return;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
int src_stride, int dst_stride, int size, int offset) {
for (int i = threadIdx.x; i < size; i += blockDim.x) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst[dst_idx] = src[src_idx];
} else {
dst[dst_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
}
}
};
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
} // namespace vllm
// KV_T is the stored data type of kv-cache.
@ -375,7 +272,7 @@ void reshape_and_cache(
torch::Tensor& slot_mapping, // [num_tokens]
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale) {
int num_tokens = slot_mapping.size(0);
int num_tokens = key.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = key_cache.size(3);
@ -446,57 +343,6 @@ void reshape_and_cache_flash(
CALL_RESHAPE_AND_CACHE_FLASH);
}
// KV_T is the stored data type of kv-cache.
// CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr()));
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// pe_dim)]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& scale) {
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
// slot_mapping.size(0) because of padding for CUDA graphs.
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
// both include padding.
// In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
// since key includes padding for CUDA graphs, while slot_mapping does not.
// In this case, slot_mapping.size(0) represents the actual number of tokens
// before padding.
// For compatibility with both cases, we use slot_mapping.size(0) as the
// number of tokens.
int num_tokens = slot_mapping.size(0);
int kv_lora_rank = kv_c.size(1);
int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1);
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
dim3 grid(num_tokens);
dim3 block(std::min(kv_lora_rank, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
CALL_CONCAT_AND_CACHE_MLA);
}
namespace vllm {
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@ -571,161 +417,3 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype);
}
}
namespace vllm {
// grid is launched with dimensions (batch, num_splits)
template <typename scalar_t>
__global__ void gather_cache(
const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t block_size, const int32_t entry_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = cu_seq_lens[bid];
const int32_t seq_end = cu_seq_lens[bid + 1];
const int32_t seq_len = seq_end - seq_start;
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
const int32_t split_start = split * split_blocks;
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
const bool is_active_split = (split_start < tot_blocks);
const bool is_last_split = (split_end == tot_blocks);
if (!is_active_split) return;
int32_t full_blocks_end = split_end;
int32_t partial_block_size = 0;
// Adjust the pointer for the block_table for this batch.
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
// page_size)
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[bid] / block_size;
}
const int32_t* batch_block_table = block_table + batch_offset + offset;
// Adjust dst pointer based on the cumulative sequence lengths.
dst += seq_start * dst_entry_stride;
if (is_last_split) {
partial_block_size = seq_len % block_size;
if (partial_block_size) full_blocks_end -= 1;
}
auto copy_entry = [&](const scalar_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
_dst[i] = _src[i];
};
for (int pid = split_start; pid < full_blocks_end; ++pid) {
auto block_id = batch_block_table[pid];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
for (int eid = 0; eid < block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
}
}
if (partial_block_size) {
auto block_id = batch_block_table[full_blocks_end];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
for (int eid = 0; eid < partial_block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
}
}
}
} // namespace vllm
// Macro to dispatch the kernel based on the data type.
#define CALL_GATHER_CACHE(CPY_DTYPE) \
vllm::gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>( \
reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()), \
reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
block_size, entry_size, block_table_stride, cache_block_stride, \
cache_entry_stride, dst_entry_stride, seq_starts_ptr);
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
// - Optionally, seq_starts (if provided) offsets the starting block index by
// (seq_starts[bid] / page_size)
void gather_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size,
std::optional<torch::Tensor> seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
int32_t entry_size = src_cache.flatten(2, -1).size(2);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
"cu_seq_lens must be int32");
if (seq_starts.has_value()) {
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
TORCH_CHECK(src_cache.device() == block_table.device(),
"src_cache and block_table must be on the same device");
TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
"src_cache and cu_seq_lens must be on the same device");
if (seq_starts.has_value()) {
TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
"src_cache and seq_starts must be on the same device");
}
int64_t block_table_stride = block_table.stride(0);
int64_t cache_block_stride = src_cache.stride(0);
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
// Decide on the number of splits based on the batch size.
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(1024);
TORCH_CHECK(src_cache.dtype() == dst.dtype(),
"src_cache and dst must have the same dtype");
const int dtype_bits = src_cache.element_size() * 8;
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
if (dtype_bits == 32) {
CALL_GATHER_CACHE(uint32_t);
} else if (dtype_bits == 16) {
CALL_GATHER_CACHE(uint16_t);
} else if (dtype_bits == 8) {
CALL_GATHER_CACHE(uint8_t);
} else {
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
}
}

View File

@ -1,9 +1,7 @@
#pragma once
#include <climits>
#include <iostream>
inline constexpr uint32_t next_pow_2(uint32_t const num) {
inline uint32_t next_pow_2(uint32_t const num) {
if (num <= 1) return num;
return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}
}

View File

@ -2,10 +2,6 @@
#include <torch/all.h>
#include <cmath>
#if defined(__APPLE__)
#include "omp.h"
#endif
namespace vec_op {
#ifdef ARM_BF16_SUPPORT

View File

@ -1,41 +1,15 @@
#pragma once
#include <stdio.h>
#if defined(__HIPCC__)
#define HOST_DEVICE_INLINE __host__ __device__
#define DEVICE_INLINE __device__
#define HOST_INLINE __host__
#elif defined(__CUDACC__) || defined(_NVHPC_CUDA)
#define HOST_DEVICE_INLINE __host__ __device__ __forceinline__
#define DEVICE_INLINE __device__ __forceinline__
#define HOST_INLINE __host__ __forceinline__
#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
#define HOST_DEVICE_INLINE __forceinline__ __host__ __device__
#define DEVICE_INLINE __forceinline__ __device__
#define HOST_INLINE __forceinline__ __host__
#else
#define HOST_DEVICE_INLINE inline
#define DEVICE_INLINE inline
#define HOST_INLINE inline
#endif
#define CUDA_CHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while (0)
int64_t get_device_attribute(int64_t attribute, int64_t device_id);
int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
namespace cuda_utils {
template <typename T>
HOST_DEVICE_INLINE constexpr std::enable_if_t<std::is_integral_v<T>, T>
ceil_div(T a, T b) {
return (a + b - 1) / b;
}
}; // namespace cuda_utils

View File

@ -1,22 +1,16 @@
#include "cuda_utils.h"
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#endif
int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
// Return the cached value on subsequent calls
static int value = [=]() {
int device = static_cast<int>(device_id);
if (device < 0) {
CUDA_CHECK(cudaGetDevice(&device));
}
int value;
CUDA_CHECK(cudaDeviceGetAttribute(
&value, static_cast<cudaDeviceAttr>(attribute), device));
return static_cast<int>(value);
}();
int device, value;
if (device_id < 0) {
cudaGetDevice(&device);
} else {
device = device_id;
}
cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
device);
return value;
}

View File

@ -12,21 +12,15 @@ extern "C" {
#include <cuda_runtime_api.h>
#include <cuda.h>
char error_msg[10240]; // 10KB buffer to store error messages
CUresult no_error = CUresult(0);
CUresult error_code = no_error; // store error code
#define CUDA_CHECK(condition) \
do { \
CUresult error = condition; \
if (error != 0) { \
error_code = error; \
char* error_string; \
cuGetErrorString(error, (const char**)&error_string); \
snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \
error_string, __FILE__, __LINE__); \
std::cerr << error_msg << std::endl; \
} \
#define CUDA_CHECK(condition) \
do { \
CUresult error = condition; \
if (error != 0) { \
char* error_string; \
cuGetErrorString(error, (const char**)&error_string); \
std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl; \
} \
} while (0)
// Global references to Python callables
@ -60,22 +54,14 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
// Allocate memory using cuMemCreate
CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0));
if (error_code != 0) {
return;
}
CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0));
if (error_code != 0) {
return;
}
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = device;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1));
if (error_code != 0) {
return;
}
// std::cout << "create_and_map: device=" << device << ", size=" << size << ",
// d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
}
@ -87,13 +73,7 @@ void unmap_and_release(unsigned long long device, ssize_t size,
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
ensure_context(device);
CUDA_CHECK(cuMemUnmap(d_mem, size));
if (error_code != 0) {
return;
}
CUDA_CHECK(cuMemRelease(*p_memHandle));
if (error_code != 0) {
return;
}
}
PyObject* create_tuple_from_c_integers(unsigned long long a,
@ -141,16 +121,12 @@ void* my_malloc(ssize_t size, int device, CUstream stream) {
size_t granularity;
CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop,
CU_MEM_ALLOC_GRANULARITY_MINIMUM));
if (error_code != 0) {
return nullptr;
}
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
CUdeviceptr d_mem;
CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0));
if (error_code != 0) {
return nullptr;
}
// allocate the CUmemGenericAllocationHandle
CUmemGenericAllocationHandle* p_memHandle =
(CUmemGenericAllocationHandle*)malloc(
@ -232,9 +208,6 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) {
// free address and the handle
CUDA_CHECK(cuMemAddressFree(d_mem, size));
if (error_code != 0) {
return;
}
free(p_memHandle);
}
@ -285,12 +258,6 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
if (error_code != 0) {
error_code = no_error;
PyErr_SetString(PyExc_RuntimeError, error_msg);
return nullptr;
}
Py_RETURN_NONE;
}
@ -315,12 +282,6 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
if (error_code != 0) {
error_code = no_error;
PyErr_SetString(PyExc_RuntimeError, error_msg);
return nullptr;
}
Py_RETURN_NONE;
}

View File

@ -38,13 +38,9 @@ struct Signal {
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};
struct __align__(16) RankData {
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
struct __align__(16) RankSignals {
Signal* signals[8];
};
struct __align__(16) RankSignals { Signal* signals[8]; };
// like std::array, but aligned
template <typename T, int sz>

View File

@ -32,20 +32,3 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
}
int32_t get_sm_version_num();
/**
* A wrapper for a kernel that is used to guard against compilation on
* architectures that will never use the kernel. The purpose of this is to
* reduce the size of the compiled binary.
* __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
* into code that will be executed on the device where it is defined.
*/
template <typename Kernel>
struct enable_sm90_or_later : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};

View File

@ -122,8 +122,8 @@ struct ScaledEpilogue
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, {}};
typename EVTCompute0::Arguments evt0_args{b_args};
return ArgumentType{a_args, evt0_args};
}
};
@ -167,8 +167,8 @@ struct ScaledEpilogueBias
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, bias_args, {}};
typename EVTCompute0::Arguments evt0_args{b_args};
return ArgumentType{a_args, evt0_args, bias_args};
}
};
@ -230,10 +230,9 @@ struct ScaledEpilogueBiasAzp
auto azp_adj_args =
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{
b_args, evt_azp_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
return ArgumentType{a_args, evt_scale_b_args, bias_args};
}
};
@ -310,12 +309,11 @@ struct ScaledEpilogueBiasAzpToken
auto azp_adj_args =
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{
b_args, evt_acc_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
return ArgumentType{a_args, evt_scale_b_args, bias_args};
}
};
}; // namespace vllm::c2x
}; // namespace vllm::c2x

View File

@ -16,58 +16,36 @@ namespace vllm::c3x {
using namespace cute;
template <typename T>
struct identity {
CUTLASS_HOST_DEVICE
T operator()(T lhs) const { return lhs; }
};
template <typename ElementAcc, typename ElementD, typename TileShape>
struct TrivialEpilogue {
private:
using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
using Compute = cutlass::epilogue::fusion::Sm90Compute<
cutlass::epilogue::thread::Identity, ElementD, ElementAcc,
cutlass::FloatRoundStyle::round_to_nearest>;
public:
using EVTCompute = cutlass::epilogue::fusion::Sm90EVT<Compute, Accum>;
using ArgumentType = typename EVTCompute::Arguments;
template <typename... Args>
static ArgumentType prepare_args(Args... args) {
return {};
}
};
/*
* This class provides the common load descriptors for the
* ScaledEpilogue[...] classes
*/
template <typename ElementAcc, typename ElementD, typename TileShape>
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogueBase {
protected:
using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
template <typename T>
using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
0 /*Stages*/, TileShape, T, Stride<Int<1>, Int<0>, Int<0>>>;
0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
Stride<Int<1>, Int<0>, Int<0>>>;
template <typename T>
using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
0 /*Stages*/, TileShape, T, Stride<Int<0>, Int<1>, Int<0>>>;
0 /*Stages*/, typename EpilogueDescriptor::TileShape, T,
Stride<Int<0>, Int<1>, Int<0>>>;
// Don't want to support nullptr by default
template <typename T, bool EnableNullPtr = false>
using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast<
0 /*Stages*/, TileShape, T, T, Stride<Int<1>, Int<0>, Int<0>>,
128 / sizeof_bits_v<T>, EnableNullPtr>;
0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
Stride<Int<1>, Int<0>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
// Don't want to support nullptr by default
template <typename T, bool EnableNullPtr = false>
using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast<
0 /*Stages*/, TileShape, T, T, Stride<Int<0>, Int<1>, Int<0>>,
128 / sizeof_bits_v<T>, EnableNullPtr>;
0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, T,
Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<T>, EnableNullPtr>;
// This utility function constructs the arguments for the load descriptors
// from a tensor. It can handle both row and column, as well as row/column or
@ -114,11 +92,11 @@ struct ScaledEpilogueBase {
the A and B operands respectively. These scales may be either per-tensor or
per row or column.
*/
template <typename ElementAcc, typename ElementD, typename TileShape>
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogue
: private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
private:
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
using Accum = typename SUPER::Accum;
using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@ -144,8 +122,8 @@ struct ScaledEpilogue
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, {}};
typename EVTCompute0::Arguments evt0_args{b_args};
return ArgumentType{a_args, evt0_args};
}
};
@ -158,11 +136,11 @@ struct ScaledEpilogue
* The bias tensor must be per-output channel.
* ScaleA and ScaleB can be per-tensor or per-token/per-channel.
*/
template <typename ElementAcc, typename ElementD, typename TileShape>
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogueBias
: private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
private:
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
using Accum = typename SUPER::Accum;
using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@ -191,51 +169,8 @@ struct ScaledEpilogueBias
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, bias_args, {}};
}
};
/*
* This epilogue performs the same operation as ScaledEpilogueBias, but the
* bias is a column vector instead of a row vector. Useful e.g. if we are
* computing a GEMM via C^T += B^T A^T. This happens in the 2:4 sparse kernels.
*/
template <typename ElementAcc, typename ElementD, typename TileShape>
struct ScaledEpilogueColumnBias
: private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
private:
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
using Accum = typename SUPER::Accum;
using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
using Bias = typename SUPER::template ColLoad<ElementD>;
using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiplies, float, float,
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTCompute0 =
cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiply_add, ElementD, float,
cutlass::FloatRoundStyle::round_to_nearest>;
public:
using EVTCompute =
cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0, Bias>;
using ArgumentType = typename EVTCompute::Arguments;
static ArgumentType prepare_args(torch::Tensor const& a_scales,
torch::Tensor const& b_scales,
torch::Tensor const& bias) {
auto a_args = SUPER::template args_from_tensor<ScaleA, float>(a_scales);
auto b_args = SUPER::template args_from_tensor<ScaleB, float>(b_scales);
auto bias_args = SUPER::template args_from_tensor<Bias, ElementD>(bias);
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, bias_args, {}};
typename EVTCompute0::Arguments evt0_args{b_args};
return ArgumentType{a_args, evt0_args, bias_args};
}
};
@ -247,11 +182,11 @@ struct ScaledEpilogueColumnBias
*
* This epilogue also supports bias, which remains per-channel.
*/
template <typename ElementAcc, typename ElementD, typename TileShape>
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogueBiasAzp
: private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
private:
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
using Accum = typename SUPER::Accum;
using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@ -295,10 +230,9 @@ struct ScaledEpilogueBiasAzp
auto azp_adj_args =
SUPER::template args_from_tensor<AzpWithAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{
b_args, evt_azp_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args};
return ArgumentType{a_args, evt_scale_b_args, bias_args};
}
};
@ -312,11 +246,11 @@ struct ScaledEpilogueBiasAzp
*
* This epilogue also supports bias, which remains per-channel.
*/
template <typename ElementAcc, typename ElementD, typename TileShape>
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogueBiasAzpToken
: private ScaledEpilogueBase<ElementAcc, ElementD, TileShape> {
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
private:
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, TileShape>;
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
using Accum = typename SUPER::Accum;
using ScaleA = typename SUPER::template ColOrScalarLoad<float>;
using ScaleB = typename SUPER::template RowOrScalarLoad<float>;
@ -373,12 +307,11 @@ struct ScaledEpilogueBiasAzpToken
auto azp_adj_args =
SUPER::template args_from_tensor<AzpAdj, int32_t>(azp_adj);
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args, {}};
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args, {}};
typename EVTComputeScaleB::Arguments evt_scale_b_args{
b_args, evt_acc_args, {}};
return ArgumentType{a_args, evt_scale_b_args, bias_args, {}};
typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args};
typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args};
typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args};
return ArgumentType{a_args, evt_scale_b_args, bias_args};
}
};
}; // namespace vllm::c3x
}; // namespace vllm::c3x

View File

@ -1,123 +0,0 @@
// Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
// clang-format off
#pragma once
#include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
#include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass::gemm::collective {
/////////////////////////////////////////////////////////////////////////////////////////////////
// GMMA_TMA_WS_SS (BlockScaled Builders)
template <
class ElementA,
class GmemLayoutATag,
int AlignmentA,
class ElementB,
class GmemLayoutBTag,
int AlignmentB,
class ElementAccumulator,
class TileShape_MNK,
class ClusterShape_MNK,
class StageCountType,
int ScaleGranularityM
>
struct CollectiveBuilder<
arch::Sm90,
arch::OpClassTensorOp,
ElementA,
GmemLayoutATag,
AlignmentA,
ElementB,
GmemLayoutBTag,
AlignmentB,
ElementAccumulator,
TileShape_MNK,
ClusterShape_MNK,
StageCountType,
KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
cute::enable_if_t<
not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
> {
using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
static_assert(is_static<TileShape_MNK>::value);
static_assert(is_static<ClusterShape_MNK>::value);
#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
#endif
static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
"Should meet TMA alignment requirement\n");
static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
KernelPtrArrayTmaWarpSpecializedCooperative,
KernelPtrArrayTmaWarpSpecializedPingpong>);
static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
"KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
// For fp32 types, map to tf32 MMA value type
using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
KernelTmaWarpSpecializedCooperative,
KernelPtrArrayTmaWarpSpecializedCooperative,
KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
using AtomLayoutMNK = cute::conditional_t<IsCooperative,
Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
using SmemCopyAtomA = void;
using SmemCopyAtomB = void;
using CollectiveOp = CollectiveMma<
DispatchPolicy,
TileShape_MNK,
ElementA,
TagToStrideA_t<GmemLayoutATag>,
ElementB,
TagToStrideB_t<GmemLayoutBTag>,
TiledMma,
GmemTiledCopyA,
SmemLayoutAtomA,
SmemCopyAtomA,
cute::identity,
GmemTiledCopyB,
SmemLayoutAtomB,
SmemCopyAtomB,
cute::identity
>;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass::gemm::collective
/////////////////////////////////////////////////////////////////////////////////////////////////

Some files were not shown because too many files have changed in this diff Show More