Compare commits
75 Commits
| Author | SHA1 | Date |
|---|---|---|
| | c00ddd6834 | |
| | 881b884046 | |
| | 98a3df0f8d | |
| | 3f6288cc89 | |
| | 408ff4950c | |
| | 278e8a1adc | |
| | 07be6ed3eb | |
| | f6637dba18 | |
| | 707a5f6473 | |
| | 57690a9c09 | |
| | b15db234ba | |
| | d1591f0f1f | |
| | 85d4488458 | |
| | 8d072dbfbd | |
| | d830766c0c | |
| | 5ae2f81c2b | |
| | 4ea41d01a9 | |
| | d16a348477 | |
| | aa092834bb | |
| | d2c6a32c0c | |
| | 21f35c2289 | |
| | 2aa9831dd3 | |
| | 028f528aad | |
| | fa5bacd5b0 | |
| | b62170e4e3 | |
| | 98eda57899 | |
| | 81b8b813f1 | |
| | e2c7dedb3a | |
| | 5323969fcf | |
| | f42b4c27d8 | |
| | 620e7646d3 | |
| | d5fb1c20c1 | |
| | 092e3d6d6d | |
| | 84284302d8 | |
| | 743695f586 | |
| | 62b870fa07 | |
| | 7e3a230c38 | |
| | 186c88c497 | |
| | ef762cb110 | |
| | 756c4e78d3 | |
| | 4880de35d2 | |
| | 0fb07c08d0 | |
| | e4377dd698 | |
| | 5cb213c85e | |
| | 25bbc21ef6 | |
| | b25fcc06c2 | |
| | 6661c030c4 | |
| | 8888d1c474 | |
| | cedb67028a | |
| | 91b47e3f2f | |
| | 6d62e4c6aa | |
| | de82e95787 | |
| | b3b89cf755 | |
| | 6692a30266 | |
| | eb0a0466a9 | |
| | c59c1e7b2c | |
| | d4adf92beb | |
| | 363e6a950f | |
| | 696b653193 | |
| | 0d6402ddfd | |
| | 60ff6b8c5c | |
| | d899009a63 | |
| | 6894d3efef | |
| | 38e3d33a62 | |
| | 02e614d922 | |
| | 46b31ed98d | |
| | 31d05f7edb | |
| | 4cdb732cef | |
| | 27c592b97b | |
| | 5083aa9092 | |
| | 824521c987 | |
| | 3b8f43024f | |
| | d148c2ef00 | |
| | 86f073edd6 | |
| | 52a1e908e4 | |
@@ -1,36 +0,0 @@
import os
import zipfile

MAX_SIZE_MB = 200


def print_top_10_largest_files(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


def check_wheel_size(directory):
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                wheel_path = os.path.join(root, f)
                wheel_size = os.path.getsize(wheel_path)
                wheel_size_mb = wheel_size / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(
                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                        f"compared to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb} MB).")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(check_wheel_size(sys.argv[1]))
@@ -8,6 +8,10 @@ set -o pipefail
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.892
  - name: "exact_match,flexible-extract"
    value: 0.892
limit: 250
num_fewshot: 5
@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
limit: 250
num_fewshot: 5
@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
limit: 250
num_fewshot: 5
@@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.86
  - name: "exact_match,flexible-extract"
    value: 0.86
limit: 250
num_fewshot: 5
@@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.624
  - name: "exact_match,flexible-extract"
    value: 0.624
limit: 250
num_fewshot: 5
@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.632
limit: 250
num_fewshot: 5
@@ -1,11 +0,0 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.792
  - name: "exact_match,flexible-extract"
    value: 0.824
limit: 250
num_fewshot: 5
@@ -1,3 +0,0 @@
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml

@@ -1,2 +0,0 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -1,46 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo
}

while getopts "m:b:l:f:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model hf \
  --model_args pretrained=$MODEL,parallelize=True \
  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
  --batch_size $BATCH_SIZE
@@ -1,51 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.2

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
  --batch_size $BATCH_SIZE
@@ -1,59 +0,0 @@
#!/bin/bash

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
    echo "  -t    - tensor parallel size"
    echo
}

SUCCESS=0

while getopts "c:t:" OPT; do
  case ${OPT} in
    c )
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE ==="

    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
@@ -1,54 +0,0 @@
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""

import os
from pathlib import Path

import lm_eval
import numpy
import yaml

RTOL = 0.02
TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def launch_lm_eval(eval_config):
    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto")

    return results


def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
@@ -1,103 +0,0 @@
# vLLM benchmark suite

## Introduction

This directory contains the performance benchmarking CI for vllm.
The goal is to help developers know the impact of their PRs on the performance of vllm.

This benchmark will be *triggered* upon:
- A PR being merged into vllm.
- Every commit for those PRs with the `perf-benchmarks` label.

**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models.

**Benchmarking Duration**: about 1hr.

**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.

## Configuring the workload

The benchmarking workload contains three parts:
- Latency tests in `latency-tests.json`.
- Throughput tests in `throughput-tests.json`.
- Serving tests in `serving-tests.json`.

See [descriptions.md](tests/descriptions.md) for detailed descriptions.

### Latency test

Here is an example of one test inside `latency-tests.json`:

```json
[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
]
```

In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Note that you should use an underscore `_` instead of a dash `-` when specifying the arguments; `run-benchmarks-suite.sh` converts the underscores back to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command-line arguments for `benchmark_latency.py` are `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`. A sketch of this conversion is shown below.
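To make the conversion concrete, here is a minimal Python sketch of the same underscore-to-dash mapping. The suite itself performs this with a `jq`-based shell helper named `json2args`; this Python version is illustrative only, not part of the suite:

```python
# Minimal sketch: convert a test's "parameters" dict into CLI arguments,
# replacing underscores in keys with dashes (mirrors the jq-based helper).
def json2args(parameters: dict) -> str:
    parts = []
    for key, value in parameters.items():
        parts.append("--" + key.replace("_", "-") + " " + str(value))
    return " ".join(parts)

params = {"model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1}
print(json2args(params))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1
```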
Note that the performance numbers are highly sensitive to the values of these parameters. Please make sure they are set correctly.

WARNING: The benchmarking script saves JSON results by itself, so please do not configure the `--output-json` parameter in the JSON file.
### Throughput test

The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed to `benchmark_throughput.py`.

The throughput number of this test is stable across runs; however, a slight change in the parameter values can shift the performance numbers by a lot.
### Serving test

We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

```json
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
]
```

Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server_parameters` attribute includes the command-line arguments for the vLLM server.
- The `client_parameters` attribute includes the command-line arguments for `benchmark_serving.py`.
- The `qps_list` attribute controls the list of QPS values to test. It is used to configure the `--request-rate` parameter in `benchmark_serving.py`.

The number from this test is less stable than the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still indicates a meaningful difference in performance.

WARNING: The benchmarking script saves JSON results by itself, so please do not configure `--save-results` or other result-saving-related parameters in `serving-tests.json`.
## Visualizing the results

The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table, by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results.
You can find the results presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The JSON version of the table (together with the JSON version of the benchmark) is also attached to the markdown file.
The raw benchmarking results (as JSON files) are in the `Artifacts` tab of the benchmarking job.
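For quick local inspection, here is a minimal sketch that loads the attached `benchmark_results.json` into a pandas dataframe; the file name and the flat list-of-records layout follow the description above, and the local path is an assumption:

```python
# Minimal sketch: load the benchmark_results.json artifact into pandas.
# Assumes the file was downloaded from the Artifacts tab into the current
# directory; it contains a flat JSON list of per-test result records.
import json

import pandas as pd

with open("benchmark_results.json") as f:
    records = json.load(f)

df = pd.DataFrame.from_records(records)
print(df.head())
```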
@@ -1,62 +0,0 @@
steps:
  - label: "Wait for container to be ready"
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            containers:
              - image: badouralix/curl-jq
                command:
                  - sh
                  - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  - wait
  - label: "A100 Benchmark"
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            priorityClassName: perf-benchmark
            containers:
              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
                command:
                  - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory
  # - label: "H100: NVIDIA SMI"
  #   agents:
  #     queue: H100
  #   plugins:
  #     - docker#v5.11.0:
  #         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
  #         command:
  #           - bash
  #           - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
  #         mount-buildkite-agent: true
  #         propagate-environment: true
  #         propagate-uid-gid: false
  #         ipc: host
  #         gpus: all
  #         environment:
  #           - VLLM_USAGE_SOURCE
  #           - HF_TOKEN
@@ -1,27 +0,0 @@
#!/usr/bin/env bash

# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
set -euo pipefail

# Install system packages
apt update
apt install -y curl jq

# Install minijinja for templating
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
source $HOME/.cargo/env

# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')

  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
  else
    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
    exit 0
  fi
fi

# Upload the benchmark pipeline
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -1,358 +0,0 @@
#!/bin/bash

# This script should be run inside the CI process
# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/

# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -o pipefail

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
    until curl localhost:8000/v1/completions; do
      sleep 1
    done' && return 0 || return 1
}

kill_gpu_processes() {
  # kill all processes on GPU.
  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
  if [ -z "$pids" ]; then
    echo "No GPU processes found."
  else
    for pid in $pids; do
      kill -9 "$pid"
      echo "Killed process with PID: $pid"
    done

    echo "All GPU processes have been killed."
  fi

  # waiting for GPU processes to be fully killed
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

run_latency_tests() {
  # run latency tests using `benchmark_latency.py`
  # $1: a json file specifying latency test cases

  local latency_test_file
  latency_test_file=$1

  # Iterate over latency tests
  jq -c '.[]' "$latency_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^latency_ ]]; then
      echo "In latency-tests.json, test_name must start with \"latency_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")

    # check if there are enough GPUs to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    latency_command="python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

    echo "Running test case $test_name"
    echo "Latency command: $latency_command"

    # record the benchmarking command and GPU type
    jq_output=$(jq -n \
      --arg latency "$latency_command" \
      --arg gpu "$gpu_type" \
      '{
        latency_command: $latency,
        gpu_type: $gpu
      }')
    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$latency_command"

    kill_gpu_processes

  done
}


run_throughput_tests() {
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases

  local throughput_test_file
  throughput_test_file=$1

  # Iterate over throughput tests
  jq -c '.[]' "$throughput_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^throughput_ ]]; then
      echo "In throughput-tests.json, test_name must start with \"throughput_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")

    # check if there are enough GPUs to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    throughput_command="python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

    echo "Running test case $test_name"
    echo "Throughput command: $throughput_command"
    # record the benchmarking command and GPU type
    jq_output=$(jq -n \
      --arg command "$throughput_command" \
      --arg gpu "$gpu_type" \
      '{
        throughput_command: $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$throughput_command"

    kill_gpu_processes

  done
}

run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
      echo "In serving-tests.json, test_name must start with \"serving_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi


    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there are enough GPUs to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # check if server model and client model are aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    wait_for_server
    if [ $? -eq 0 ]; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu
        }')
      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill_gpu_processes
  done
}

main() {
  check_gpus
  check_hf_token

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)

  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOG_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json


  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

  upload_to_buildkite
}

main "$@"
@@ -1,192 +0,0 @@
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")

# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Mean latency (ms)",
    # "P10": "P10 (s)",
    # "P25": "P25 (s)",
    "P50": "Median latency (ms)",
    # "P75": "P75 (s)",
    # "P90": "P90 (s)",
    "P99": "P99 latency (ms)",
}

# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "num_requests": "# of req.",
    # "total_num_tokens": "Total # of tokens",
    # "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    # "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    # "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    # "input_throughput": "Input Tput (tok/s)",
    # "output_throughput": "Output Tput (tok/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    # "mean_tpot_ms": "Mean TPOT (ms)",
    # "median_tpot_ms": "Median",
    # "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
}


def read_markdown(file):
    if os.path.exists(file):
        with open(file, "r") as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"


def results_to_json(latency, throughput, serving):
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })


if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file, "r") as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to the collected serving results
            serving_results.append(raw_result)
            continue

        elif "latency" in str(test_file):
            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply by 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to the collected latency results
            latency_results.append(raw_result)
            continue

        elif "throughput" in str(test_file):
            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
            with open(test_file.with_suffix(".commands"), "r") as f:
                command = json.loads(f.read())
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to the collected throughput results
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remapping the keys, for visualization purposes
    if not latency_results.empty:
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    # get markdown tables
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:

        results = read_markdown(
            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:

        results = latency_results.to_dict(
            orient='records') + throughput_results.to_dict(
                orient='records') + serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
@@ -1,17 +0,0 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

retries=0
while [ $retries -lt 1000 ]; do
    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
        exit 0
    fi

    echo "Waiting for image to be available..."

    retries=$((retries + 1))
    sleep 5
done

exit 1
@@ -1,67 +0,0 @@

## Latency tests

This test suite aims to test vllm's end-to-end latency under a controlled setup.

- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

### Latency benchmarking results

{latency_tests_markdown_table}

## Throughput tests

This test suite aims to test vllm's throughput.

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.

### Throughput benchmarking results

{throughput_tests_markdown_table}

## Serving tests

This test suite aims to test vllm's real serving metrics.

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed); see the sketch after this list.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
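To make the arrival pattern concrete, here is a minimal Python sketch of Poisson-process arrival sampling at a given average QPS. Inter-arrival gaps are exponentially distributed with mean 1/QPS; this illustrates the idea rather than reproducing `benchmark_serving.py` exactly:

```python
# Minimal sketch: sample request arrival times from a Poisson process.
# With average rate `qps`, inter-arrival gaps are exponential with mean 1/qps.
import numpy as np

def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0):
    rng = np.random.default_rng(seed)  # fixed seed, as in the benchmark
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)  # absolute send time of each request (seconds)

print(poisson_arrival_times(num_requests=5, qps=4.0))
```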
### Serving benchmarking results

{serving_tests_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:

```python
import json
import pandas as pd

benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```

The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```

You can also check the raw experiment data in the Artifact tab of the Buildkite page.
@@ -1,32 +0,0 @@
[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_llama70B_tp4",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_mixtral8x7B_tp2",
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
@@ -1,59 +0,0 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
@@ -1,35 +0,0 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-8B",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama70B_tp4",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_mixtral8x7B_tp2",
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
@@ -1,21 +0,0 @@
steps:
  - block: "Build wheels"

  - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
    agents:
      queue: cpu_queue
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
    matrix:
      setup:
        cuda_version:
          - "11.8.0"
          - "12.1.0"
        python_version:
          - "3.8"
          - "3.9"
          - "3.10"
          - "3.11"
@@ -1,73 +1,38 @@
# This script runs tests inside the corresponding ROCm docker container.
# This script builds the ROCm docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Print ROCm version
echo "--- ROCm info"
rocminfo

# cleanup older docker images
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes
    docker volume prune -f
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .

# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state

while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Building container"
sha=$(git rev-parse --short HEAD)
image_name=rocm_${sha}
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
docker build \
  -t ${image_name} \
  -f Dockerfile.rocm \
  --progress plain \
  .

remove_docker_container() {
  docker rm -f ${container_name} || docker image rm -f ${image_name} || true
}
# Setup cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container

echo "--- Running container"
# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &

docker run \
  --device /dev/kfd --device /dev/dri \
  --network host \
  --rm \
  -e HF_TOKEN \
  --name ${container_name} \
  ${image_name} \
  /bin/bash -c "${@}"
# Wait for the server to start
wait_for_server_to_start() {
  timeout=300
  counter=0

  while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
    sleep 1
    counter=$((counter + 1))
    if [ $counter -ge $timeout ]; then
      echo "Timeout after $timeout seconds"
      break
    fi
  done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
  localhost:8000/generate \
  -d '{"prompt": "San Francisco is a"}'
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite

@@ -50,16 +50,11 @@ echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md

# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /usr/bin/buildkite-agent ]; then
    exit 0
fi

# upload the results to buildkite
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then

@@ -74,5 +69,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
    exit $bench_serving_exit_code
fi

rm ShareGPT_V3_unfiltered_cleaned_split.json
buildkite-agent artifact upload "*.json"
/workspace/buildkite-agent artifact upload openai-*.json
@@ -4,25 +4,11 @@ set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .
docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2

# offline inference
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "cd tests;
  pip install pytest Pillow protobuf
  cd ../
  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
@@ -1,51 +0,0 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        docker system prune -f
        echo $current_time > /tmp/neuron-docker-build-timestamp
    fi
else
    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
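The curl call above only prints whatever the server returns, so a malformed response would not fail the job. One possible hardening, shown here only as a sketch (the endpoint and payload are the ones already used above), is to let curl's exit status gate the step:

# Sketch: -f turns HTTP errors into a non-zero exit code and --max-time bounds the request,
# so the CI step fails instead of silently printing an error page.
curl -f --max-time 120 -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}' || exit 1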
@@ -1,14 +0,0 @@
# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .

# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
@@ -1,14 +0,0 @@
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .

# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
@@ -1,243 +1,111 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.

# This script will be feed into Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# This script will be feed into Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.

steps:
- label: Regression Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: AsyncEngine Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s async_engine
|
||||
|
||||
- label: Basic Correctness Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
- VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_chunked_prefill.py
|
||||
|
||||
- label: Core Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pytest -v -s core
|
||||
- pytest -v -s distributed/test_parallel_state.py
|
||||
command: pytest -v -s core
|
||||
|
||||
- label: Distributed Comm Ops Test
|
||||
#mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
commands:
|
||||
- pytest -v -s distributed/test_comm_ops.py
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
command: pytest -v -s test_comm_ops.py
|
||||
working_dir: "/vllm-workspace/tests/distributed"
|
||||
num_gpus: 2 # only support 1 or 2 for now.
|
||||
|
||||
- label: Distributed Tests (2 GPUs)
|
||||
mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
- label: Distributed Tests
|
||||
working_dir: "/vllm-workspace/tests/distributed"
|
||||
num_gpus: 2 # only support 1 or 2 for now.
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
|
||||
|
||||
- label: Distributed Tests (4 GPUs)
|
||||
#mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
|
||||
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||
|
||||
- label: Pipeline Parallelism Test
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
commands:
|
||||
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
|
||||
- pytest -v -s test_pynccl.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
|
||||
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
|
||||
|
||||
- label: Engine Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
|
||||
command: pytest -v -s engine tokenization test_sequence.py test_config.py
|
||||
|
||||
- label: Entrypoints Test
|
||||
mirror_hardwares: [amd]
|
||||
|
||||
commands:
|
||||
- pytest -v -s entrypoints/llm
|
||||
- pytest -v -s entrypoints/openai
|
||||
# these tests have to be separated, because each one will allocate all posible GPU memory
|
||||
- pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
|
||||
- pytest -v -s entrypoints/test_server_oot_registration.py
|
||||
|
||||
- label: Examples Test
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
# install aws cli for llava_example.py
|
||||
# install tensorizer for tensorize_vllm_model.py
|
||||
- pip install awscli tensorizer
|
||||
- pip install awscli
|
||||
- python3 offline_inference.py
|
||||
- python3 offline_inference_with_prefix.py
|
||||
- python3 llm_engine_example.py
|
||||
- python3 llava_example.py
|
||||
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
|
||||
- label: Inputs Test
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s multimodal
|
||||
|
||||
- label: Kernels Test %N
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 4
|
||||
|
||||
- label: Models Test
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- pytest -v -s models -m \"not vlm\"
|
||||
|
||||
- label: Vision Language Models Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- pytest -v -s models -m vlm
|
||||
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
|
||||
|
||||
- label: Llava Test
|
||||
commands:
|
||||
- bash ../.buildkite/download-images.sh
|
||||
- pytest -v -s models/test_llava.py
|
||||
|
||||
- label: Prefix Caching Test
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pytest -v -s prefix_caching
|
||||
|
||||
- label: Samplers Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s samplers
|
||||
|
||||
- label: LogitsProcessor Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s test_logits_processor.py
|
||||
|
||||
- label: Utils Test
|
||||
command: pytest -v -s test_utils.py
|
||||
|
||||
- label: Worker Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s worker
|
||||
|
||||
- label: Speculative decoding tests
|
||||
#mirror_hardwares: [amd]
|
||||
commands:
|
||||
# See https://github.com/vllm-project/vllm/issues/5152
|
||||
- export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||
- pytest -v -s spec_decode
|
||||
command: pytest -v -s spec_decode
|
||||
|
||||
- label: LoRA Test %N
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 4
|
||||
|
||||
- label: LoRA Long Context (Distributed)
|
||||
#mirror_hardwares: [amd]
|
||||
num_gpus: 4
|
||||
# This test runs llama 13B, so it is required to run on 4 GPUs.
|
||||
commands:
|
||||
# FIXIT: find out which code initialize cuda before running the test
|
||||
# before the fix, we need to use spawn to test it
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s -x lora/test_long_context.py
|
||||
|
||||
- label: Tensorizer Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
|
||||
command: apt-get install curl libsodium23 && pytest -v -s tensorizer
|
||||
|
||||
- label: Metrics Test
|
||||
mirror_hardwares: [amd]
|
||||
command: pytest -v -s metrics
|
||||
|
||||
- label: Quantization Test
|
||||
#mirror_hardwares: [amd]
|
||||
command: pytest -v -s quantization
|
||||
|
||||
- label: Tracing Test
|
||||
commands:
|
||||
- "pip install \
|
||||
opentelemetry-sdk \
|
||||
opentelemetry-api \
|
||||
opentelemetry-exporter-otlp \
|
||||
opentelemetry-semantic-conventions-ai"
|
||||
- pytest -v -s tracing
|
||||
|
||||
- label: Benchmarks
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
mirror_hardwares: [amd]
|
||||
commands:
|
||||
- pip install aiohttp
|
||||
- bash run-benchmarks.sh
|
||||
|
||||
- label: LM Eval Small Models
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
commands:
|
||||
- pip install lm-eval
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-small.txt -t 1
|
||||
|
||||
- label: LM Eval Large Models
|
||||
gpu: a100
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
commands:
|
||||
- pip install lm-eval
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- bash ./run-tests.sh -c configs/models-large.txt -t 4
|
||||
|
||||
- label: Documentation Build
|
||||
working_dir: "/vllm-workspace/test_docs/docs"
|
||||
no_gpu: True
|
||||
commands:
|
||||
- pip install -r requirements-docs.txt
|
||||
- SPHINXOPTS=\"-W\" make html
|
||||
|
||||
- label: Distributed Tests (A100)
|
||||
gpu: a100
|
||||
num_gpus: 4
|
||||
commands:
|
||||
# NOTE: don't test llama model here, it seems hf implementation is buggy
|
||||
# see https://github.com/vllm-project/vllm/pull/5689 for details
|
||||
- pytest -v -s distributed/test_custom_all_reduce.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
|
||||
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
69 .buildkite/test-template.j2 Normal file
@@ -0,0 +1,69 @@
|
||||
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
|
||||
{% set default_num_gpu = 1 %}
|
||||
{% set default_working_dir = "/vllm-workspace/tests" %}
|
||||
|
||||
steps:
|
||||
- label: "AMD Test"
|
||||
agents:
|
||||
queue: amd
|
||||
command: bash .buildkite/run-amd-test.sh
|
||||
|
||||
- label: "CPU Test"
|
||||
command: bash .buildkite/run-cpu-test.sh
|
||||
|
||||
- label: ":docker: build image"
|
||||
commands:
|
||||
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
|
||||
- "docker push {{ docker_image }}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 5
|
||||
- wait
|
||||
|
||||
{% for step in steps %}
|
||||
- label: "{{ step.label }}"
|
||||
agents:
|
||||
queue: kubernetes
|
||||
soft_fail: {{ step.soft_fail or false }}
|
||||
{% if step.parallelism %}
|
||||
parallelism: {{ step.parallelism }}
|
||||
{% endif %}
|
||||
retry:
|
||||
automatic:
|
||||
- exit_status: -1 # Agent was lost
|
||||
limit: 5
|
||||
plugins:
|
||||
- kubernetes:
|
||||
podSpec:
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
containers:
|
||||
- image: "{{ docker_image }}"
|
||||
command: ["bash"]
|
||||
args:
|
||||
- '-c'
|
||||
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
|
||||
{% if not step.no_gpu %}
|
||||
resources:
|
||||
requests:
|
||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
||||
limits:
|
||||
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
|
||||
{% endif %}
|
||||
env:
|
||||
- name: VLLM_USAGE_SOURCE
|
||||
value: ci-test
|
||||
- name: HF_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
{% endfor %}
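For reference, the pipeline is produced by rendering this template against the `steps` list from `.buildkite/test-pipeline.yaml`. A minimal local-rendering sketch, assuming `jinja2` and `PyYAML` are installed (the real CI uses the buildkite-ci repository's own tooling), would be:

# Sketch: render test-template.j2 with the steps from test-pipeline.yaml and print the result.
python3 - <<'EOF'
import yaml
from jinja2 import Template

steps = yaml.safe_load(open(".buildkite/test-pipeline.yaml"))["steps"]
print(Template(open(".buildkite/test-template.j2").read()).render(steps=steps))
EOF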
@ -1,26 +0,0 @@
|
||||
BasedOnStyle: Google
|
||||
UseTab: Never
|
||||
IndentWidth: 2
|
||||
ColumnLimit: 80
|
||||
|
||||
# Force pointers to the type for C++.
|
||||
DerivePointerAlignment: false
|
||||
PointerAlignment: Left
|
||||
|
||||
# Reordering #include statements can (and currently will) introduce errors
|
||||
SortIncludes: false
|
||||
|
||||
# Style choices
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
IndentPPDirectives: BeforeHash
|
||||
|
||||
IncludeCategories:
|
||||
- Regex: '^<'
|
||||
Priority: 4
|
||||
- Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
|
||||
Priority: 3
|
||||
- Regex: '^"(qoda|\.\.)/'
|
||||
Priority: 2
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
1 .github/ISSUE_TEMPLATE/200-installation.yml vendored
@@ -18,7 +18,6 @@ body:
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
|
||||
1 .github/ISSUE_TEMPLATE/300-usage.yml vendored
@@ -18,7 +18,6 @@ body:
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
|
||||
5 .github/ISSUE_TEMPLATE/400-bug report.yml vendored
@@ -18,7 +18,6 @@ body:
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
@ -58,10 +57,6 @@ body:
|
||||
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
|
||||
|
||||
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
|
||||
|
||||
Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
|
||||
|
||||
If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
|
||||
placeholder: |
|
||||
A clear and concise description of what the bug is.
|
||||
|
||||
|
||||
@ -39,7 +39,6 @@ body:
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
|
||||
49 .github/ISSUE_TEMPLATE/750-RFC.yml vendored
@@ -1,49 +0,0 @@
|
||||
name: 💬 Request for comments (RFC).
|
||||
description: Ask for feedback on major architectural changes or design choices.
|
||||
title: "[RFC]: "
|
||||
labels: ["RFC"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Motivation.
|
||||
description: >
|
||||
The motivation of the RFC.
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Proposed Change.
|
||||
description: >
|
||||
The proposed change of the RFC.
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Feedback Period.
|
||||
description: >
|
||||
The feedback period of the RFC. Usually at least one week.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: CC List.
|
||||
description: >
|
||||
The list of people you want to CC.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Any Other Things.
|
||||
description: >
|
||||
Any other things you would like to mention.
|
||||
validations:
|
||||
required: false
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
42 .github/workflows/clang-format.yml vendored
@@ -1,42 +0,0 @@
|
||||
name: clang-format
|
||||
|
||||
on:
|
||||
# Trigger the workflow on push or pull request,
|
||||
# but only for the main branch
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
clang-format:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.11"]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install clang-format==18.1.5
|
||||
- name: Running clang-format
|
||||
run: |
|
||||
EXCLUDES=(
|
||||
'csrc/moe/topk_softmax_kernels.cu'
|
||||
'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
|
||||
'csrc/punica/bgmv/bgmv_config.h'
|
||||
'csrc/punica/bgmv/bgmv_impl.cuh'
|
||||
'csrc/punica/bgmv/vec_dtypes.cuh'
|
||||
'csrc/punica/punica_ops.cu'
|
||||
'csrc/punica/type_convert.h'
|
||||
)
|
||||
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
|
||||
| grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
|
||||
| xargs clang-format --dry-run --Werror
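The same file selection can be reused locally; as a sketch (not part of the workflow above, and it assumes the EXCLUDES array has been defined as shown there), dropping --dry-run/--Werror in favor of -i makes clang-format rewrite the offending files instead of only reporting them:

# Sketch: reuses the EXCLUDES array defined above; -i applies the formatting in place.
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
    | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
    | xargs clang-format -i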
31 .github/workflows/mypy.yaml vendored
@@ -32,20 +32,19 @@ jobs:
|
||||
pip install types-setuptools
|
||||
- name: Mypy
|
||||
run: |
|
||||
mypy vllm/attention --config-file pyproject.toml
|
||||
mypy vllm/core --config-file pyproject.toml
|
||||
mypy vllm/distributed --config-file pyproject.toml
|
||||
mypy vllm/entrypoints --config-file pyproject.toml
|
||||
mypy vllm/executor --config-file pyproject.toml
|
||||
mypy vllm/multimodal --config-file pyproject.toml
|
||||
mypy vllm/usage --config-file pyproject.toml
|
||||
mypy vllm/*.py --config-file pyproject.toml
|
||||
mypy vllm/transformers_utils --config-file pyproject.toml
|
||||
mypy vllm/engine --config-file pyproject.toml
|
||||
mypy vllm/worker --config-file pyproject.toml
|
||||
mypy vllm/spec_decode --config-file pyproject.toml
|
||||
mypy vllm/model_executor --config-file pyproject.toml
|
||||
mypy vllm/lora --config-file pyproject.toml
|
||||
mypy vllm/logging --config-file pyproject.toml
|
||||
mypy tests --config-file pyproject.toml
|
||||
mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
|
||||
# TODO(sang): Follow up
|
||||
# mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
# mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
# mypy vllm/spec_decoding/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
# mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
|
||||
|
||||
|
||||
10 .github/workflows/publish.yml vendored
@@ -49,19 +49,13 @@ jobs:
|
||||
matrix:
|
||||
os: ['ubuntu-20.04']
|
||||
python-version: ['3.8', '3.9', '3.10', '3.11']
|
||||
pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements-cuda.txt.
|
||||
cuda-version: ['11.8', '12.1']
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Setup ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2
|
||||
with:
|
||||
create-symlink: true
|
||||
key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
|
||||
|
||||
- name: Set up Linux Env
|
||||
if: ${{ runner.os == 'Linux' }}
|
||||
run: |
|
||||
@ -82,8 +76,6 @@ jobs:
|
||||
|
||||
- name: Build wheel
|
||||
shell: bash
|
||||
env:
|
||||
CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
|
||||
run: |
|
||||
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
|
||||
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
|
||||
|
||||
2 .github/workflows/ruff.yml vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
|
||||
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
|
||||
- name: Analysing the code with ruff
|
||||
run: |
|
||||
ruff .
|
||||
|
||||
2 .github/workflows/scripts/create_release.js vendored
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
|
||||
generate_release_notes: true,
|
||||
name: process.env.RELEASE_TAG,
|
||||
owner: context.repo.owner,
|
||||
prerelease: true,
|
||||
prerelease: false,
|
||||
repo: context.repo.repo,
|
||||
tag_name: process.env.RELEASE_TAG,
|
||||
});
|
||||
|
||||
2 .gitignore vendored
@@ -70,8 +70,6 @@ instance/
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
docs/source/getting_started/examples/*.rst
|
||||
!**/*.template.rst
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
|
||||
103 CMakeLists.txt
@@ -2,8 +2,7 @@ cmake_minimum_required(VERSION 3.21)
|
||||
|
||||
project(vllm_extensions LANGUAGES CXX)
|
||||
|
||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||
option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
|
||||
|
||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||
@ -32,8 +31,9 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@ -66,6 +66,19 @@ endif()
|
||||
#
|
||||
find_package(Torch REQUIRED)
|
||||
|
||||
#
|
||||
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
|
||||
# `libtorch_python.so` for linking against an extension. Torch's cmake
|
||||
# configuration does not include this library (presumably since the cmake
|
||||
# config is used for standalone C++ binaries that link against torch).
|
||||
# The `libtorch_python.so` library defines some of the glue code between
|
||||
# torch/python via pybind and is required by VLLM extensions for this
|
||||
# reason. So, add it by manually with `find_library` using torch's
|
||||
# installed library path.
|
||||
#
|
||||
find_library(torch_python_LIBRARY torch_python PATHS
|
||||
"${TORCH_INSTALL_PREFIX}/lib")
|
||||
|
||||
#
|
||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||
#
|
||||
@ -98,11 +111,18 @@ elseif(HIP_FOUND)
|
||||
# .hip extension automatically, HIP must be enabled explicitly.
|
||||
enable_language(HIP)
|
||||
|
||||
# ROCm 5.X and 6.X
|
||||
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||
# ROCm 5.x
|
||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
|
||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
|
||||
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
|
||||
# ROCm 6.x
|
||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
|
||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
|
||||
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
|
||||
@ -147,48 +167,15 @@ set(VLLM_EXT_SRC
|
||||
"csrc/layernorm_kernels.cu"
|
||||
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||
"csrc/quantization/fp8/common.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/moe_align_block_size_kernels.cu"
|
||||
"csrc/torch_bindings.cpp")
|
||||
"csrc/pybind.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(FetchContent)
|
||||
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
|
||||
FetchContent_Declare(
|
||||
cutlass
|
||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||
# CUTLASS 3.5.0
|
||||
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
|
||||
)
|
||||
FetchContent_MakeAvailable(cutlass)
|
||||
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||
"csrc/quantization/fp8/fp8_marlin.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
|
||||
|
||||
#
|
||||
# The CUTLASS kernels for Hopper require sm90a to be enabled.
|
||||
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
|
||||
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
|
||||
set_source_files_properties(
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
||||
PROPERTIES
|
||||
COMPILE_FLAGS
|
||||
"-gencode arch=compute_90a,code=sm_90a")
|
||||
endif()
|
||||
|
||||
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
|
||||
"csrc/custom_all_reduce.cu")
|
||||
endif()
|
||||
|
||||
define_gpu_extension_target(
|
||||
@ -198,8 +185,6 @@ define_gpu_extension_target(
|
||||
SOURCES ${VLLM_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
#
|
||||
@ -207,7 +192,7 @@ define_gpu_extension_target(
|
||||
#
|
||||
|
||||
set(VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/torch_bindings.cpp"
|
||||
"csrc/moe/moe_ops.cpp"
|
||||
"csrc/moe/topk_softmax_kernels.cu")
|
||||
|
||||
define_gpu_extension_target(
|
||||
@ -217,7 +202,6 @@ define_gpu_extension_target(
|
||||
SOURCES ${VLLM_MOE_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
|
||||
#
|
||||
@ -226,13 +210,24 @@ define_gpu_extension_target(
|
||||
|
||||
set(VLLM_PUNICA_EXT_SRC
|
||||
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
|
||||
"csrc/punica/punica_ops.cu"
|
||||
"csrc/punica/torch_bindings.cpp")
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
|
||||
"csrc/punica/punica_ops.cc")
|
||||
|
||||
#
|
||||
# Copy GPU compilation flags+update for punica
|
||||
@ -256,9 +251,6 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
|
||||
endif()
|
||||
endforeach()
|
||||
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
|
||||
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
|
||||
set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
|
||||
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
|
||||
endif()
|
||||
|
||||
if (VLLM_PUNICA_GPU_ARCHES)
|
||||
@ -269,7 +261,6 @@ if (VLLM_PUNICA_GPU_ARCHES)
|
||||
SOURCES ${VLLM_PUNICA_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
|
||||
USE_SABI 3
|
||||
WITH_SOABI)
|
||||
else()
|
||||
message(WARNING "Unable to create _punica_C target because none of the "
|
||||
@ -294,7 +285,9 @@ add_custom_target(default)
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
|
||||
message(STATUS "Enabling C extension.")
|
||||
add_dependencies(default _C)
|
||||
endif()
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
message(STATUS "Enabling moe extension.")
|
||||
add_dependencies(default _moe_C)
|
||||
|
||||
|
||||
114 Dockerfile
@@ -1,39 +1,18 @@
|
||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
# Please update any changes made here to
|
||||
# docs/source/dev/dockerfile/dockerfile.rst and
|
||||
# docs/source/assets/dev/dockerfile-stages-dependency.png
|
||||
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
# prepare basic build environment
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
|
||||
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
ARG PYTHON_VERSION=3
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
|
||||
&& if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
|
||||
&& python3 --version \
|
||||
&& python3 -m pip --version
|
||||
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y python3-pip git curl sudo
|
||||
&& apt-get install -y python3-pip git
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
# this won't be needed for future versions of this docker image
|
||||
# or future versions of triton.
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
RUN ldconfig /usr/local/cuda-12.1/compat/
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
@ -41,11 +20,12 @@ WORKDIR /workspace
|
||||
COPY requirements-common.txt requirements-common.txt
|
||||
COPY requirements-cuda.txt requirements-cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install -r requirements-cuda.txt
|
||||
pip install -r requirements-cuda.txt
|
||||
|
||||
COPY requirements-mamba.txt requirements-mamba.txt
|
||||
RUN python3 -m pip install packaging
|
||||
RUN python3 -m pip install -r requirements-mamba.txt
|
||||
# install development dependencies
|
||||
COPY requirements-dev.txt requirements-dev.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install -r requirements-dev.txt
|
||||
|
||||
# cuda arch list used by torch
|
||||
# can be useful for both `dev` and `test`
|
||||
@ -55,16 +35,14 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
|
||||
ARG PYTHON_VERSION=3
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM dev AS build
|
||||
|
||||
# install build dependencies
|
||||
COPY requirements-build.txt requirements-build.txt
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install -r requirements-build.txt
|
||||
pip install -r requirements-build.txt
|
||||
|
||||
# install compiler cache to speed up compilation leveraging local or remote caching
|
||||
RUN apt-get update -y && apt-get install -y ccache
|
||||
@ -88,65 +66,39 @@ ENV NVCC_THREADS=$nvcc_threads
|
||||
# make sure punica kernels are built (for LoRA)
|
||||
ENV VLLM_INSTALL_PUNICA_KERNELS=1
|
||||
|
||||
ARG USE_SCCACHE
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& export SCCACHE_BUCKET=vllm-build-sccache \
|
||||
&& export SCCACHE_REGION=us-west-2 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||
&& sccache --show-stats; \
|
||||
fi
|
||||
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist; \
|
||||
fi
|
||||
|
||||
# check the size of the wheel, we cannot upload wheels larger than 100MB
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
RUN python3 check-wheel-size.py dist
|
||||
python3 setup.py bdist_wheel --dist-dir=dist
|
||||
|
||||
# the `vllm_nccl` package must be installed from source distribution
|
||||
# pip is too smart to store a wheel in the cache, and other CI jobs
|
||||
# will directly use the wheel from the cache, which is not what we want.
|
||||
# we need to remove it manually
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip cache remove vllm_nccl*
|
||||
#################### EXTENSION Build IMAGE ####################
|
||||
|
||||
#################### DEV IMAGE ####################
|
||||
FROM base as dev
|
||||
|
||||
COPY requirements-lint.txt requirements-lint.txt
|
||||
COPY requirements-test.txt requirements-test.txt
|
||||
COPY requirements-dev.txt requirements-dev.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install -r requirements-dev.txt
|
||||
|
||||
#################### DEV IMAGE ####################
|
||||
#################### MAMBA Build IMAGE ####################
|
||||
FROM dev as mamba-builder
|
||||
#################### FLASH_ATTENTION Build IMAGE ####################
|
||||
FROM dev as flash-attn-builder
|
||||
# max jobs used for build
|
||||
ARG max_jobs=2
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
# flash attention version
|
||||
ARG flash_attn_version=v2.5.6
|
||||
ENV FLASH_ATTN_VERSION=${flash_attn_version}
|
||||
|
||||
WORKDIR /usr/src/mamba
|
||||
|
||||
COPY requirements-mamba.txt requirements-mamba.txt
|
||||
WORKDIR /usr/src/flash-attention-v2
|
||||
|
||||
# Download the wheel or build it if a pre-compiled release doesn't exist
|
||||
RUN pip --verbose wheel -r requirements-mamba.txt \
|
||||
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
|
||||
--no-build-isolation --no-deps --no-cache-dir
|
||||
|
||||
#################### MAMBA Build IMAGE ####################
|
||||
#################### FLASH_ATTENTION Build IMAGE ####################
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
# image with vLLM installed
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
|
||||
ARG CUDA_VERSION=12.4.1
|
||||
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
|
||||
WORKDIR /vllm-workspace
|
||||
|
||||
RUN apt-get update -y \
|
||||
@ -156,16 +108,16 @@ RUN apt-get update -y \
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
# this won't be needed for future versions of this docker image
|
||||
# or future versions of triton.
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
RUN ldconfig /usr/local/cuda-12.1/compat/
|
||||
|
||||
# install vllm wheel first, so that torch etc will be installed
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install dist/*.whl --verbose
|
||||
pip install dist/*.whl --verbose
|
||||
|
||||
RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
|
||||
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
|
||||
--mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
|
||||
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
|
||||
@ -178,7 +130,7 @@ ADD . /vllm-workspace/
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install -r requirements-dev.txt
|
||||
pip install -r requirements-dev.txt
|
||||
|
||||
# doc requires source code
|
||||
# we hide them inside `test_docs/` , so that this source code
|
||||
@ -195,7 +147,7 @@ FROM vllm-base AS vllm-openai
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install accelerate hf_transfer 'modelscope!=1.15.0'
|
||||
pip install accelerate hf_transfer modelscope
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
|
||||
@ -1,25 +1,13 @@
|
||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
||||
|
||||
FROM ubuntu:22.04 AS cpu-test-1
|
||||
FROM ubuntu:22.04
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
|
||||
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
|
||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||
# intel-openmp provides additional performance improvement vs. openmp
|
||||
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
|
||||
RUN pip install intel-openmp
|
||||
|
||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
|
||||
|
||||
|
||||
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
|
||||
|
||||
RUN pip install --upgrade pip \
|
||||
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
|
||||
|
||||
FROM cpu-test-1 AS build
|
||||
&& pip install wheel packaging ninja setuptools>=49.4.0 numpy
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
@ -27,14 +15,6 @@ WORKDIR /workspace/vllm
|
||||
|
||||
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
|
||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
||||
ARG VLLM_CPU_DISABLE_AVX512
|
||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
|
||||
WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
@ -1,36 +0,0 @@
|
||||
# default base image
|
||||
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
RUN echo "Base image is $BASE_IMAGE"
|
||||
|
||||
# Install some basic utilities
|
||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
||||
|
||||
### Mount Point ###
|
||||
# When launching the container, mount the code directory to /app
|
||||
ARG APP_MOUNT=/app
|
||||
VOLUME [ ${APP_MOUNT} ]
|
||||
WORKDIR ${APP_MOUNT}
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
||||
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
|
||||
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
|
||||
COPY ./vllm /app/vllm/vllm
|
||||
COPY ./setup.py /app/vllm/setup.py
|
||||
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
|
||||
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
|
||||
|
||||
RUN cd /app/vllm \
|
||||
&& python3 -m pip install -U -r requirements-neuron.txt
|
||||
|
||||
ENV VLLM_TARGET_DEVICE neuron
|
||||
RUN cd /app/vllm \
|
||||
&& pip install -e . \
|
||||
&& cd ..
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@ -1,26 +0,0 @@
|
||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
FROM ubuntu:22.04 AS dev
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y python3-pip git
|
||||
WORKDIR /workspace
|
||||
|
||||
# copy requirements
|
||||
COPY requirements-build.txt /workspace/vllm/
|
||||
COPY requirements-common.txt /workspace/vllm/
|
||||
COPY requirements-openvino.txt /workspace/vllm/
|
||||
|
||||
COPY vllm/ /workspace/vllm/vllm
|
||||
COPY setup.py /workspace/vllm/
|
||||
|
||||
# install build requirements
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
|
||||
# build vLLM with OpenVINO backend
|
||||
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
|
||||
|
||||
COPY examples/ /workspace/vllm/examples
|
||||
COPY benchmarks/ /workspace/vllm/benchmarks
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@ -1,22 +0,0 @@
|
||||
FROM mambaorg/micromamba
|
||||
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
||||
USER root
|
||||
|
||||
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
||||
|
||||
# Some packages in requirements-cpu are installed here
|
||||
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
||||
# Currently these may not be available for venv or pip directly
|
||||
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# These packages will be in rocketce eventually
|
||||
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
|
||||
|
||||
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
||||
|
||||
WORKDIR /vllm-workspace
|
||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
225 Dockerfile.rocm
@@ -1,35 +1,35 @@
|
||||
# Default ROCm 6.1 base image
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||
# default base image
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
||||
|
||||
# Tested and supported base rocm/pytorch images
|
||||
ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
|
||||
ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
|
||||
ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
|
||||
|
||||
RUN echo "Base image is $BASE_IMAGE"
|
||||
|
||||
# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"

# Whether to build CK-based flash-attention
# If 0, will not build flash attention
# This is useful for gfx target where flash-attention is not supported
# (i.e. those that do not appear in `FA_GFX_ARCHS`)
# Triton FA is used by default on ROCm now so this is unnecessary.
ARG BUILD_FA="1"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="ae7928c"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"

# Whether to build triton on rocm
ARG FA_BRANCH="3d2b6f5"
RUN echo "FA_BRANCH is $FA_BRANCH"

# whether to build flash-attention
# if 0, will not build flash attention
# this is useful for gfx target where flash-attention is not supported
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"

# whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="0ef1848"

### Base image build stage
FROM $BASE_IMAGE AS base

# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

# Install some basic utilities
RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
@@ -40,165 +40,70 @@ RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    unzip \
    nvidia-cuda-toolkit \
    tmux \
    ccache \
 && rm -rf /var/lib/apt/lists/*

# When launching the container, mount the code directory to /vllm-workspace
ARG APP_MOUNT=/vllm-workspace
### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
# Install torch == 2.4.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
    *"rocm-5.7"*) \
        pip uninstall -y torch torchaudio torchvision \
        && pip install --no-cache-dir --pre \
            torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
            torchvision==0.19.0.dev20240612 \
            --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
    *"rocm-6.0"*) \
        pip uninstall -y torch torchaudio torchvision \
        && pip install --no-cache-dir --pre \
            torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
            torchvision==0.19.0.dev20240612 \
            --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
    *"rocm-6.1"*) \
        pip uninstall -y torch torchaudio torchvision \
        && pip install --no-cache-dir --pre \
            torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
            torchvision==0.19.0.dev20240612 \
            --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
    *) ;; esac
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas

ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:

ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache


### AMD-SMI build stage
FROM base AS build_amdsmi
# Build amdsmi wheel always
RUN cd /opt/rocm/share/amd_smi \
    && pip wheel . --wheel-dir=/install


### Flash-Attention wheel build stage
FROM base AS build_fa
ARG BUILD_FA
ARG FA_GFX_ARCHS
ARG FA_BRANCH
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
    if [ "$BUILD_FA" = "1" ]; then \
        mkdir -p libs \
# Install ROCm flash-attention
RUN if [ "$BUILD_FA" = "1" ]; then \
    mkdir libs \
        && cd libs \
        && git clone https://github.com/ROCm/flash-attention.git \
        && cd flash-attention \
        && git checkout "${FA_BRANCH}" \
        && git checkout ${FA_BRANCH} \
        && git submodule update --init \
        && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
            *"rocm-5.7"*) \
                export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
                && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
            *) ;; esac \
        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
    # Create an empty directory otherwise as later build stages expect one
    else mkdir -p /install; \
    && export GPU_ARCHS=${FA_GFX_ARCHS} \
    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
    && python3 setup.py install \
    && cd ..; \
    fi


### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
    if [ "$BUILD_TRITON" = "1" ]; then \
        mkdir -p libs \
        && cd libs \
        && git clone https://github.com/OpenAI/triton.git \
        && cd triton \
        && git checkout "${TRITON_BRANCH}" \
        && cd python \
        && python3 setup.py bdist_wheel --dist-dir=/install; \
    # Create an empty directory otherwise as later build stages expect one
    else mkdir -p /install; \
    fi


### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually remove it so that later steps of numpy upgrade can continue
RUN case "$(which python3)" in \
    *"/opt/conda/envs/py_3.9"*) \
        rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
    *) ;; esac
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade numba scipy huggingface-hub[cli]

# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false

RUN --mount=type=cache,target=${CCACHE_DIR} \
    --mount=type=cache,target=/root/.cache/pip \
    pip install -U -r requirements-rocm.txt \
    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
        *"rocm-6.0"*) \
            patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
        *"rocm-6.1"*) \
            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
            && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
            # Prevent interference if torch bundles its own HIP runtime
            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
        *) ;; esac \
    && python3 setup.py clean --all \
    && python3 setup.py develop

# Copy amdsmi wheel into final image
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
    mkdir -p libs \
    && cp /install/*.whl libs \
    # Preemptively uninstall to avoid same-version no-installs
    && pip uninstall -y amdsmi;
    && cd libs \
    && pip uninstall -y triton \
    && git clone https://github.com/ROCm/triton.git \
    && cd triton/python \
    && pip3 install . \
    && cd ../..; \
    fi

# Copy triton wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
    mkdir -p libs \
    && if ls /install/*.whl; then \
        cp /install/*.whl libs \
        # Preemptively uninstall to avoid same-version no-installs
        && pip uninstall -y triton; fi
COPY ./ /app/vllm

# Copy flash-attn wheel(s) into final image if they were built
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
    mkdir -p libs \
    && if ls /install/*.whl; then \
        cp /install/*.whl libs \
        # Preemptively uninstall to avoid same-version no-installs
        && pip uninstall -y flash-attn; fi
RUN python3 -m pip install --upgrade pip numba
RUN python3 -m pip install xformers==0.0.23 --no-deps

# Install wheels that were built to the final image
RUN --mount=type=cache,target=/root/.cache/pip \
    if ls libs/*.whl; then \
        pip install libs/*.whl; fi
RUN cd /app \
    && cd vllm \
    && pip install -U -r requirements-rocm.txt \
    && if [ "$BUILD_FA" = "1" ]; then \
        bash patch_xformers.rocm.sh; fi \
    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
    && python3 setup.py install \
    && cd ..

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3

CMD ["/bin/bash"]
@@ -1,19 +0,0 @@
ARG NIGHTLY_DATE="20240601"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

FROM $BASE_IMAGE

WORKDIR /workspace
COPY . /workspace/vllm

ENV VLLM_TARGET_DEVICE="tpu"
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

# Build vLLM.
RUN cd /workspace/vllm && python setup.py develop

CMD ["/bin/bash"]
@@ -1,22 +0,0 @@
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    rm /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y \
    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-xpu.txt

RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install

CMD ["/bin/bash"]
@@ -1,9 +1,6 @@
include LICENSE
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include CMakeLists.txt

recursive-include cmake *
86	README.md
@@ -14,19 +14,7 @@ Easy, fast, and cheap LLM serving for everyone

</p>

---

**Ray Summit CPF is Open (June 4th to June 20th)!**

There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
This will be a great chance for everyone in the community to get together and learn.
Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)

---

*Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
@@ -59,18 +47,44 @@ vLLM is flexible and easy to use with:
- Tensor parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
- Support NVIDIA GPUs and AMD GPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support

vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral)
- Multi-modal LLMs (e.g., LLaVA)
vLLM seamlessly supports many Hugging Face models, including the following architectures:

Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).

## Getting Started
- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

@@ -78,7 +92,9 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
pip install vllm
```
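
As a quick sanity check after installing, a minimal offline-inference sketch (the model name below is only an illustrative choice; any model from the list above works):

```python
from vllm import LLM, SamplingParams

# Illustrative model; substitute any supported Hugging Face model.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```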

Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
## Getting Started

Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@@ -88,34 +104,6 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

## Sponsors

vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!

<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->

- a16z
- AMD
- Anyscale
- AWS
- Crusoe Cloud
- Databricks
- DeepInfra
- Dropbox
- Lambda Lab
- NVIDIA
- Replicate
- Roblox
- RunPod
- Sequoia Capital
- Trainy
- UC Berkeley
- UC San Diego
- ZhenFund

We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

## Citation

If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
@@ -4,13 +4,10 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import List, Optional

import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

@@ -71,13 +68,9 @@ async def async_request_tgi(
                    chunk_bytes = chunk_bytes.strip()
                    if not chunk_bytes:
                        continue
                    chunk_bytes = chunk_bytes.decode("utf-8")

                    #NOTE: Sometimes TGI returns a ping response without
                    # any data, we should skip it.
                    if chunk_bytes.startswith(":"):
                        continue
                    chunk = remove_prefix(chunk_bytes, "data:")
                    chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                          "data:")

                    data = json.loads(chunk)
                    timestamp = time.perf_counter()
@@ -96,9 +89,6 @@ async def async_request_tgi(
                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
@@ -145,7 +135,6 @@ async def async_request_trt_llm(
                                          "data:")

                    data = json.loads(chunk)
                    output.generated_text += data["text_output"]
                    timestamp = time.perf_counter()
                    # First token
                    if ttft == 0.0:
@@ -160,6 +149,7 @@ async def async_request_trt_llm(
                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.generated_text = json.loads(data)["text_output"]
                    output.success = True

                else:
@@ -225,8 +215,8 @@ async def async_request_openai_completions(
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        "completions"
    ), "OpenAI Completions API URL must end with 'completions'."
        "v1/completions"
    ), "OpenAI Completions API URL must end with 'v1/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@@ -265,9 +255,6 @@ async def async_request_openai_completions(
                    else:
                        data = json.loads(chunk)

                        # NOTE: Some completion API might have a last
                        # usage summary response without a token so we
                        # want to check a token was generated
                        if data["choices"][0]["text"]:
                            timestamp = time.perf_counter()
                            # First token
@@ -276,8 +263,12 @@ async def async_request_openai_completions(
                                output.ttft = ttft

                            # Decoding phase
                            output.itl.append(timestamp -
                                              most_recent_timestamp)
                        # NOTE: Some completion API might have a last
                        # usage summary response without a token so we
                        # do not want to include as inter-token-latency
                        elif data.get("usage", None) is None:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                            most_recent_timestamp = timestamp
                            generated_text += data["choices"][0]["text"]
@@ -285,9 +276,6 @@ async def async_request_openai_completions(
                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
@@ -304,8 +292,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        "chat/completions"
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
        "v1/chat/completions"
    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@@ -390,30 +378,6 @@ def remove_prefix(text: str, prefix: str) -> str:
    return text


def get_model(pretrained_model_name_or_path: str):
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download
    else:
        from huggingface_hub import snapshot_download

    model_path = snapshot_download(
        model_id=pretrained_model_name_or_path,
        local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
        ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
    return model_path


def get_tokenizer(
    pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                         trust_remote_code=trust_remote_code)


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
@@ -422,5 +386,4 @@ ASYNC_REQUEST_FUNCS = {
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
}
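
For context, a minimal sketch of how the entries in `ASYNC_REQUEST_FUNCS` are meant to be driven. The server URL and model name below are assumptions for illustration; any OpenAI-compatible completions endpoint ending in `v1/completions` should work:

```python
import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def demo():
    # "vllm" resolves to the OpenAI-completions-style request function above.
    request_func = ASYNC_REQUEST_FUNCS["vllm"]
    test_input = RequestFuncInput(
        model="facebook/opt-125m",  # illustrative model name
        prompt="Hello, my name is",
        api_url="http://localhost:8000/v1/completions",  # assumed server URL
        prompt_len=5,
        output_len=16,
        best_of=1,
        use_beam_search=False,
    )
    output = await request_func(request_func_input=test_input)
    print(output.success, output.generated_text)


asyncio.run(demo())
```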
148	benchmarks/bench_cache_write.py	Normal file
@@ -0,0 +1,148 @@
import functools
import time
from typing import Tuple

import chex
import jax
import jax.numpy as jnp

_PAD_SLOT_ID = -1


@jax.jit
def write_to_kv_cache1(
    key: jax.Array,  # [batch_size, seq_len, num_heads, head_size]
    value: jax.Array,  # [batch_size, seq_len, num_heads, head_size]
    k_cache: jax.Array,  # [num_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,  # [num_heads, num_blocks * block_size, head_size]
    slot_mapping: jax.Array,  # [batch_size, seq_len]
) -> Tuple[jax.Array, jax.Array]:
    num_heads = key.shape[-2]
    head_size = key.shape[-1]

    key = key.reshape(-1, num_heads, head_size)
    key = key.transpose((1, 0, 2))
    value = value.reshape(-1, num_heads, head_size)
    value = value.transpose((1, 0, 2))

    k_cache = k_cache.at[:, slot_mapping.reshape(-1), :].set(key)
    v_cache = v_cache.at[:, slot_mapping.reshape(-1), :].set(value)
    return k_cache, v_cache


@functools.partial(jax.jit, donate_argnums=(2, 3))
def write_to_kv_cache2(
    key: jax.Array,  # [batch_size, seq_len, num_heads, head_size]
    value: jax.Array,  # [batch_size, seq_len, num_heads, head_size]
    k_cache: jax.Array,  # [num_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,  # [num_heads, num_blocks * block_size, head_size]
    slot_mapping: jax.Array,  # [batch_size, seq_len]
) -> Tuple[jax.Array, jax.Array]:
    batch_size = slot_mapping.shape[0]

    def cond(val: _IteratorState):
        return val.idx < batch_size

    def body(val: _IteratorState):
        k_cache, v_cache = _write_seq_to_kv_cache(
            key[val.idx],
            value[val.idx],
            val.k_cache,
            val.v_cache,
            slot_mapping[val.idx],
        )
        val.k_cache = k_cache
        val.v_cache = v_cache
        val.idx += 1
        return val

    iterator = _IteratorState(idx=0, k_cache=k_cache, v_cache=v_cache)
    iterator = jax.lax.while_loop(cond, body, iterator)
    return iterator.k_cache, iterator.v_cache


@functools.partial(jax.jit, donate_argnums=(2, 3))
def _write_seq_to_kv_cache(
    key: jax.Array,  # [seq_len, num_heads, head_size]
    value: jax.Array,  # [seq_len, num_heads, head_size]
    k_cache: jax.Array,  # [num_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,  # [num_heads, num_blocks * block_size, head_size]
    slot_mapping: jax.Array,  # [seq_len]
) -> Tuple[jax.Array, jax.Array]:
    seq_len = slot_mapping.shape[0]
    num_heads, _, head_size = k_cache.shape
    # Reshape to match the rank of kv_cache.
    key = key.reshape(seq_len, num_heads, 1, head_size)
    value = value.reshape(seq_len, num_heads, 1, head_size)

    def cond(val: _IteratorState):
        return jnp.logical_and(
            val.idx < seq_len, slot_mapping[val.idx] != _PAD_SLOT_ID)

    def body(val: _IteratorState):
        slot_idx = slot_mapping[val.idx]
        val.k_cache = jax.lax.dynamic_update_slice(
            val.k_cache,
            key[val.idx],
            (0, slot_idx, 0),
        )
        val.v_cache = jax.lax.dynamic_update_slice(
            val.v_cache,
            value[val.idx],
            (0, slot_idx, 0),
        )
        val.idx += 1
        return val

    iterator = _IteratorState(idx=0, k_cache=k_cache, v_cache=v_cache)
    iterator = jax.lax.while_loop(cond, body, iterator)
    return iterator.k_cache, iterator.v_cache


@chex.dataclass
class _IteratorState:

    idx: jnp.int32
    k_cache: jnp.ndarray  # [num_heads, num_blocks, block_size, head_size]
    v_cache: jnp.ndarray  # [num_heads, num_blocks, block_size, head_size]


def benchmark_write_to_kv_cache(
    batch_size: int,
    seq_len: int,
    num_kv_heads: int,
    head_size: int,
    num_blocks: int,
    block_size: int,
    version: int = 1,
):
    if version == 1:
        f = write_to_kv_cache1
    elif version == 2:
        f = write_to_kv_cache2
    else:
        raise ValueError(f"Invalid version: {version}")

    rng_key = jax.random.PRNGKey(0)
    key = jax.random.normal(rng_key, (batch_size, seq_len, num_kv_heads, head_size), dtype=jnp.bfloat16)
    value = jax.random.normal(rng_key, (batch_size, seq_len, num_kv_heads, head_size), dtype=jnp.bfloat16)
    k_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    v_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    slot_mapping = jax.random.randint(rng_key, (batch_size, seq_len), 0, num_blocks * block_size, dtype=jnp.int32)

    # For JIT compilation.
    k_cache, v_cache = f(key, value, k_cache, v_cache, slot_mapping)
    k_cache.block_until_ready()

    start = time.time()
    for _ in range(100):
        k_cache, v_cache = f(key, value, k_cache, v_cache, slot_mapping)
    k_cache.block_until_ready()
    end = time.time()
    print(f"Time taken: {(end - start) * 10:.2f} ms")


if __name__ == "__main__":
    for num_blocks in [16, 256, 512, 1024, 2048, 8192, 16384]:
        print(f"Benchmarking Write to KV Cache w/ {num_blocks} blocks")
        benchmark_write_to_kv_cache(16, 256, 16, 256, num_blocks, 16, version=1)
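
The two cache-write variants above differ mainly in whether the cache arrays are donated to the jitted update (`donate_argnums`), which allows XLA to reuse the input buffers instead of materializing a fresh copy of the whole KV cache on every call. A minimal, self-contained illustration of that idea (shapes are arbitrary and chosen only for the example):

```python
import functools

import jax
import jax.numpy as jnp


# Functional update: XLA may allocate a new output buffer the size of the cache.
@jax.jit
def update(cache, x):
    return cache.at[0].set(x)


# Donated update: the input cache buffer may be reused for the output,
# so the old `cache` reference must not be used afterwards.
@functools.partial(jax.jit, donate_argnums=(0,))
def update_donated(cache, x):
    return cache.at[0].set(x)


cache = jnp.zeros((1024, 256), dtype=jnp.bfloat16)
x = jnp.ones((256,), dtype=jnp.bfloat16)
cache = update_donated(cache, x)  # rebind the name to the returned buffer
```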
101	benchmarks/bench_paged_attn.py	Normal file
@@ -0,0 +1,101 @@
import argparse
import functools
import time

import jax
import jax.numpy as jnp
from jax.experimental.pallas.ops.tpu.paged_attention import paged_attention

BLOCK_SIZE = 16
MAX_NUM_BLOCKS_PER_SEQ = 512


@functools.partial(jax.jit, static_argnums=(6, 7))
def paged_attn(
    q: jax.Array,  # [batch, 1, num_heads, head_size]
    k_cache: jax.Array,  # [num_kv_heads, num_blocks * block_size, head_size]
    v_cache: jax.Array,  # [num_kv_heads, num_blocks * block_size, head_size]
    sm_scale: float,
    block_tables: jax.Array,  # [batch, max_num_blocks_per_batch]
    context_lens: jax.Array,  # [batch]
    block_size: int,
    pages_per_compute_block: int,
) -> jax.Array:  # [batch, 1, num_heads, head_size]
    q = q.squeeze(1)
    q = q * sm_scale

    head_size = q.shape[-1]
    num_slots = k_cache.shape[-2]
    k_cache = k_cache.reshape(-1, num_slots // block_size, block_size, head_size)
    v_cache = v_cache.reshape(-1, num_slots // block_size, block_size, head_size)

    output = paged_attention(
        q,
        k_cache,
        v_cache,
        context_lens,
        block_tables,
        pages_per_compute_block=pages_per_compute_block,
    )
    return output.reshape(q.shape[0], 1, q.shape[1], q.shape[2])


def benchmark_paged_attn(
    batch_size: int,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    context_len: int,
    num_blocks: int,
    block_size: int,
    pages_per_compute_block: int,
):
    rng_key = jax.random.PRNGKey(0)
    query = jax.random.normal(rng_key, (batch_size, 1, num_heads, head_size), dtype=jnp.bfloat16)
    k_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    v_cache = jax.random.normal(rng_key, (num_kv_heads, num_blocks * block_size, head_size), dtype=jnp.bfloat16)
    sm_scale = head_size ** -0.5
    block_tables = jax.random.randint(rng_key, (batch_size, MAX_NUM_BLOCKS_PER_SEQ), 0, num_blocks, dtype=jnp.int32)
    context_lens = jnp.array([context_len] * batch_size, dtype=jnp.int32)

    # For JIT compilation.
    output = paged_attn(query, k_cache, v_cache, sm_scale, block_tables, context_lens, block_size, pages_per_compute_block)
    output.block_until_ready()

    start = time.time()
    for _ in range(100):
        output = paged_attn(query, k_cache, v_cache, sm_scale, block_tables, context_lens, block_size, pages_per_compute_block)
    output.block_until_ready()
    end = time.time()

    print(f"Time taken: {(end - start) * 10000:.2f} us")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument("--num-heads", type=int, default=16)
    parser.add_argument("--num-kv-heads", type=int, default=16)
    parser.add_argument("--head-size", type=int, default=256)
    parser.add_argument("--context-len", type=int, default=512)
    parser.add_argument("--num-blocks", type=int, default=2048)
    args = parser.parse_args()
    print(args)

    for block_size in [16, 32, 64, 128]:
        for pages_per_compute_block in [1, 2, 4, 8, 16, 32, 64, 128]:
            if pages_per_compute_block > MAX_NUM_BLOCKS_PER_SEQ:
                continue
            if block_size * pages_per_compute_block > 1024:
                continue
            print(f"block_size {block_size}, pages_per_compute_block: {pages_per_compute_block}")
            benchmark_paged_attn(
                args.batch_size,
                args.num_heads,
                args.num_kv_heads,
                args.head_size,
                args.context_len,
                args.num_blocks,
                block_size,
                pages_per_compute_block,
            )
@@ -1,19 +1,14 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import json
import time
from pathlib import Path
from typing import List, Optional
from typing import Optional

import numpy as np
import torch
from tqdm import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptStrictInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser


def main(args: argparse.Namespace):
@@ -21,33 +16,20 @@ def main(args: argparse.Namespace):

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        speculative_model=args.speculative_model,
        num_speculative_tokens=args.num_speculative_tokens,
        speculative_draft_tensor_parallel_size=\
            args.speculative_draft_tensor_parallel_size,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
        max_model_len=args.max_model_len,
        enforce_eager=args.enforce_eager,
        kv_cache_dtype=args.kv_cache_dtype,
        quantization_param_path=args.quantization_param_path,
        device=args.device,
        ray_workers_use_nsight=args.ray_workers_use_nsight,
        use_v2_block_manager=args.use_v2_block_manager,
        enable_chunked_prefill=args.enable_chunked_prefill,
        download_dir=args.download_dir,
        block_size=args.block_size,
        gpu_memory_utilization=args.gpu_memory_utilization,
        load_format=args.load_format,
        distributed_executor_backend=args.distributed_executor_backend,
        otlp_traces_endpoint=args.otlp_traces_endpoint,
        enable_prefix_caching=args.enable_prefix_caching,
    )
    llm = LLM(model=args.model,
              tokenizer=args.tokenizer,
              quantization=args.quantization,
              tensor_parallel_size=args.tensor_parallel_size,
              trust_remote_code=args.trust_remote_code,
              dtype=args.dtype,
              enforce_eager=args.enforce_eager,
              kv_cache_dtype=args.kv_cache_dtype,
              quantization_param_path=args.quantization_param_path,
              device=args.device,
              ray_workers_use_nsight=args.ray_workers_use_nsight,
              enable_chunked_prefill=args.enable_chunked_prefill,
              download_dir=args.download_dir,
              block_size=args.block_size)

    sampling_params = SamplingParams(
        n=args.n,
@@ -61,9 +43,7 @@ def main(args: argparse.Namespace):
    dummy_prompt_token_ids = np.random.randint(10000,
                                               size=(args.batch_size,
                                                     args.input_len))
    dummy_inputs: List[PromptStrictInputs] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]
    dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()

    def run_to_completion(profile_dir: Optional[str] = None):
        if profile_dir:
@@ -74,13 +54,13 @@ def main(args: argparse.Namespace):
                    ],
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        str(profile_dir))) as p:
                llm.generate(dummy_inputs,
                llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                             sampling_params=sampling_params,
                             use_tqdm=False)
            print(p.key_averages())
        else:
            start_time = time.perf_counter()
            llm.generate(dummy_inputs,
            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                         sampling_params=sampling_params,
                         use_tqdm=False)
            end_time = time.perf_counter()
@@ -106,38 +86,22 @@ def main(args: argparse.Namespace):
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentages = [10, 25, 50, 75, 90]
    percentiles = np.percentile(latencies, percentages)
    print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
        print(f'{percentage}% percentile latency: {percentile} seconds')

    # Output JSON results if specified
    if args.output_json:
        results = {
            "avg_latency": np.mean(latencies),
            "latencies": latencies.tolist(),
            "percentiles": dict(zip(percentages, percentiles.tolist())),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)


if __name__ == '__main__':
    parser = FlexibleArgumentParser(
    parser = argparse.ArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--speculative-model', type=str, default=None)
    parser.add_argument('--num-speculative-tokens', type=int, default=None)
    parser.add_argument('--speculative-draft-tensor-parallel-size',
                        '-spec-draft-tp',
                        type=int,
                        default=None)
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
                        choices=[*QUANTIZATION_METHODS, None],
                        choices=['awq', 'gptq', 'squeezellm', None],
                        default=None)
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--input-len', type=int, default=32)
@@ -159,12 +123,6 @@ if __name__ == '__main__':
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument(
        '--max-model-len',
        type=int,
        default=None,
        help='Maximum length of a sequence (including prompt and output). '
        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype',
        type=str,
@@ -178,13 +136,15 @@ if __name__ == '__main__':
        action='store_true',
        help='enforce eager mode and disable CUDA graph')
    parser.add_argument(
        '--kv-cache-dtype',
        "--kv-cache-dtype",
        type=str,
        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
        default="auto",
        help='Data type for kv cache storage. If "auto", will use model '
        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
        choices=['auto', 'fp8'],
        default='auto',
        help=
        'Data type for kv cache storage. If "auto", will use model data type. '
        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
        'common inference criteria.')
    parser.add_argument(
        '--quantization-param-path',
        type=str,
@@ -208,10 +168,9 @@ if __name__ == '__main__':
    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
        'CPU.')
        default="cuda",
        choices=["cuda", "cpu"],
        help='device type for vLLM execution, supporting CUDA and CPU.')
    parser.add_argument('--block-size',
                        type=int,
                        default=16,
@@ -221,10 +180,6 @@ if __name__ == '__main__':
        action='store_true',
        help='If True, the prefill requests can be chunked based on the '
        'max_num_batched_tokens')
    parser.add_argument("--enable-prefix-caching",
                        action='store_true',
                        help="Enable automatic prefix caching")
    parser.add_argument('--use-v2-block-manager', action='store_true')
    parser.add_argument(
        "--ray-workers-use-nsight",
        action='store_true',
@@ -235,51 +190,5 @@ if __name__ == '__main__':
        default=None,
        help='directory to download and load the weights, '
        'default to the default cache dir of huggingface')
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the latency results in JSON format.')
    parser.add_argument('--gpu-memory-utilization',
                        type=float,
                        default=0.9,
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1.'
                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument(
        '--load-format',
        type=str,
        default=EngineArgs.load_format,
        choices=[
            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
            'bitsandbytes'
        ],
        help='The format of the model weights to load.\n\n'
        '* "auto" will try to load the weights in the safetensors format '
        'and fall back to the pytorch bin format if safetensors format '
        'is not available.\n'
        '* "pt" will load the weights in the pytorch bin format.\n'
        '* "safetensors" will load the weights in the safetensors format.\n'
        '* "npcache" will load the weights in pytorch format and store '
        'a numpy cache to speed up the loading.\n'
        '* "dummy" will initialize the weights with random values, '
        'which is mainly for profiling.\n'
        '* "tensorizer" will load the weights using tensorizer from '
        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
        'section for more information.\n'
        '* "bitsandbytes" will load the weights using bitsandbytes '
        'quantization.\n')
    parser.add_argument(
        '--distributed-executor-backend',
        choices=['ray', 'mp'],
        default=None,
        help='Backend to use for distributed serving. When more than 1 GPU '
        'is used, will be automatically set to "ray" if installed '
        'or "mp" (multiprocessing) otherwise.')
    parser.add_argument(
        '--otlp-traces-endpoint',
        type=str,
        default=None,
        help='Target URL to which OpenTelemetry traces will be sent.')
    args = parser.parse_args()
    main(args)
@@ -1,7 +1,7 @@
import argparse
import time

from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser

PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501

@@ -16,22 +16,20 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):


def main(args):
    llm = LLM(model=args.model,
    llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
              tokenizer_mode='auto',
              trust_remote_code=True,
              enforce_eager=True,
              use_v2_block_manager=args.use_v2_block_manager,
              tensor_parallel_size=args.tensor_parallel_size,
              enable_prefix_caching=args.enable_prefix_caching)

    num_prompts = 100
    prompts = [PROMPT] * num_prompts
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
    sampling_params = SamplingParams(temperature=0, max_tokens=100)

    print("------warm up------")
    test_prefix(
        llm=llm,
        prompts=prompts,
        prompts=prompts[:1],
        sampling_params=sampling_params,
    )

@@ -44,19 +42,11 @@ def main(args):


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
    parser = argparse.ArgumentParser(
        description='Benchmark the performance with or without automatic '
        'prefix caching.')
    parser.add_argument('--model',
                        type=str,
                        default='baichuan-inc/Baichuan2-13B-Chat')
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--output-len', type=int, default=10)
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceMangerV2')
    args = parser.parse_args()
    main(args)
@ -17,10 +17,6 @@ On the client side, run:
|
||||
--dataset-path <path to dataset> \
|
||||
--request-rate <request_rate> \ # By default <request_rate> is inf
|
||||
--num-prompts <num_prompts> # By default <num_prompts> is 1000
|
||||
|
||||
when using tgi backend, add
|
||||
--endpoint /generate_stream
|
||||
to the end of the command above.
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
@ -31,7 +27,7 @@ import time
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
||||
from typing import AsyncGenerator, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
@ -39,15 +35,7 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
try:
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
except ImportError:
|
||||
from backend_request_func import get_tokenizer
|
||||
|
||||
try:
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -64,20 +52,13 @@ class BenchmarkMetrics:
|
||||
mean_tpot_ms: float
|
||||
median_tpot_ms: float
|
||||
p99_tpot_ms: float
|
||||
mean_itl_ms: float
|
||||
median_itl_ms: float
|
||||
p99_itl_ms: float
|
||||
|
||||
|
||||
def sample_sharegpt_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int] = None,
|
||||
) -> List[Tuple[str, int, int]]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
dataset = json.load(f)
|
||||
@ -87,32 +68,38 @@ def sample_sharegpt_requests(
|
||||
dataset = [(data["conversations"][0]["value"],
|
||||
data["conversations"][1]["value"]) for data in dataset]
|
||||
|
||||
# Shuffle the dataset.
|
||||
random.shuffle(dataset)
|
||||
# some of these will be filtered out, so sample more than we need
|
||||
sampled_indices = random.sample(range(len(dataset)),
|
||||
int(num_requests * 1.2))
|
||||
dataset = [dataset[i] for i in sampled_indices]
|
||||
|
||||
# Filter out sequences that are too long or too short
|
||||
filtered_dataset: List[Tuple[str, int, int]] = []
|
||||
# Tokenize the prompts and completions.
|
||||
prompts = [prompt for prompt, _ in dataset]
|
||||
prompt_token_ids = tokenizer(prompts).input_ids
|
||||
completions = [completion for _, completion in dataset]
|
||||
completion_token_ids = tokenizer(completions).input_ids
|
||||
tokenized_dataset = []
|
||||
for i in range(len(dataset)):
|
||||
if len(filtered_dataset) == num_requests:
|
||||
break
|
||||
output_len = len(completion_token_ids[i])
|
||||
tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
|
||||
|
||||
# Tokenize the prompts and completions.
|
||||
prompt = dataset[i][0]
|
||||
prompt_token_ids = tokenizer(prompt).input_ids
|
||||
completion = dataset[i][1]
|
||||
completion_token_ids = tokenizer(completion).input_ids
|
||||
# Filter out too long sequences.
|
||||
filtered_dataset: List[Tuple[str, int, int]] = []
|
||||
for prompt, prompt_token_ids, output_len in tokenized_dataset:
|
||||
prompt_len = len(prompt_token_ids)
|
||||
output_len = len(completion_token_ids
|
||||
) if fixed_output_len is None else fixed_output_len
|
||||
if prompt_len < 4 or output_len < 4:
|
||||
# Prune too short sequences.
|
||||
# This is because TGI causes errors when the input or output length
|
||||
# is too short.
|
||||
continue
|
||||
if prompt_len > 1024 or prompt_len + output_len > 2048:
|
||||
# Prune too long sequences.
|
||||
continue
|
||||
filtered_dataset.append((prompt, prompt_len, output_len))
|
||||
|
||||
return filtered_dataset
|
||||
# Sample the requests.
|
||||
sampled_requests = random.sample(filtered_dataset, num_requests)
|
||||
return sampled_requests
|
||||
|
||||
|
||||
def sample_sonnet_requests(
|
||||
@ -208,37 +195,24 @@ def calculate_metrics(
|
||||
dur_s: float,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||
actual_output_lens: List[int] = []
|
||||
actual_output_lens = []
|
||||
total_input = 0
|
||||
completed = 0
|
||||
itls: List[float] = []
|
||||
tpots: List[float] = []
|
||||
ttfts: List[float] = []
|
||||
tpots = []
|
||||
ttfts = []
|
||||
for i in range(len(outputs)):
|
||||
if outputs[i].success:
|
||||
# We use the tokenizer to count the number of output tokens for all
|
||||
# serving backends instead of looking at len(outputs[i].itl) since
|
||||
# multiple output tokens may be bundled together
|
||||
# Note: this may inflate the output token count slightly
|
||||
output_len = len(
|
||||
tokenizer(outputs[i].generated_text,
|
||||
add_special_tokens=False).input_ids)
|
||||
output_len = len(tokenizer(outputs[i].generated_text).input_ids)
|
||||
actual_output_lens.append(output_len)
|
||||
total_input += input_requests[i][1]
|
||||
if output_len > 1:
|
||||
tpots.append(
|
||||
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
||||
itls += outputs[i].itl
|
||||
ttfts.append(outputs[i].ttft)
|
||||
completed += 1
|
||||
else:
|
||||
actual_output_lens.append(0)
|
||||
|
||||
if completed == 0:
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
"on the benchmark arguments.",
|
||||
stacklevel=2)
|
||||
metrics = BenchmarkMetrics(
|
||||
completed=completed,
|
||||
total_input=total_input,
|
||||
@ -250,12 +224,9 @@ def calculate_metrics(
|
||||
1000, # ttfts is empty if streaming is not supported by backend
|
||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
|
||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
|
||||
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||
median_itl_ms=np.median(itls or 0) * 1000,
|
||||
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
|
||||
mean_tpot_ms=np.mean(tpots) * 1000,
|
||||
median_tpot_ms=np.median(tpots) * 1000,
|
||||
p99_tpot_ms=np.percentile(tpots, 99) * 1000,
|
||||
)
|
||||
|
||||
return metrics, actual_output_lens
|
||||
@ -273,34 +244,16 @@ async def benchmark(
|
||||
disable_tqdm: bool,
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
request_func = ASYNC_REQUEST_FUNCS.get(backend)
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
print("Starting initial single prompt test run...")
|
||||
test_prompt, test_prompt_len, test_output_len = input_requests[0]
|
||||
test_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
prompt=test_prompt,
|
||||
api_url=api_url,
|
||||
prompt_len=test_prompt_len,
|
||||
output_len=test_output_len,
|
||||
best_of=best_of,
|
||||
use_beam_search=use_beam_search,
|
||||
)
|
||||
test_output = await request_func(request_func_input=test_input)
|
||||
if not test_output.success:
|
||||
raise ValueError(
|
||||
"Initial test run failed - Please make sure benchmark arguments "
|
||||
f"are correctly specified. Error: {test_output.error}")
|
||||
else:
|
||||
print("Initial test run completed. Starting main benchmark run...")
|
||||
print(f"Traffic request rate: {request_rate}")
|
||||
|
||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||
|
||||
benchmark_start_time = time.perf_counter()
|
||||
tasks: List[asyncio.Task] = []
|
||||
tasks = []
|
||||
async for request in get_request(input_requests, request_rate):
|
||||
prompt, prompt_len, output_len = request
|
||||
request_func_input = RequestFuncInput(
|
||||
@ -318,7 +271,7 @@ async def benchmark(
|
||||
pbar=pbar)))
|
||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
|
||||
if pbar is not None:
|
||||
if not disable_tqdm:
|
||||
pbar.close()
|
||||
|
||||
benchmark_duration = time.perf_counter() - benchmark_start_time
|
||||
@ -355,10 +308,6 @@ async def benchmark(
|
||||
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
|
||||
metrics.median_tpot_ms))
|
||||
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
|
||||
print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
|
||||
print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
|
||||
print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
|
||||
print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
|
||||
print("=" * 50)
|
||||
|
||||
result = {
|
||||
@ -375,9 +324,6 @@ async def benchmark(
|
||||
"mean_tpot_ms": metrics.mean_tpot_ms,
|
||||
"median_tpot_ms": metrics.median_tpot_ms,
|
||||
"p99_tpot_ms": metrics.p99_tpot_ms,
|
||||
"mean_itl_ms": metrics.mean_itl_ms,
|
||||
"median_itl_ms": metrics.median_itl_ms,
|
||||
"p99_itl_ms": metrics.p99_itl_ms,
|
||||
"input_lens": [output.prompt_len for output in outputs],
|
||||
"output_lens": actual_output_lens,
|
||||
"ttfts": [output.ttft for output in outputs],
|
||||
@ -415,7 +361,6 @@ def main(args: argparse.Namespace):
|
||||
dataset_path=args.dataset,
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
fixed_output_len=args.sharegpt_output_len,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "sharegpt":
|
||||
@ -423,7 +368,6 @@ def main(args: argparse.Namespace):
|
||||
dataset_path=args.dataset_path,
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
fixed_output_len=args.sharegpt_output_len,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "sonnet":
|
||||
@ -474,7 +418,7 @@ def main(args: argparse.Namespace):
|
||||
|
||||
# Save config and results to json
|
||||
if args.save_result:
|
||||
result_json: Dict[str, Any] = {}
|
||||
result_json = {}
|
||||
|
||||
# Setup
|
||||
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
@ -507,8 +451,6 @@ def main(args: argparse.Namespace):
|
||||
# Save to file
|
||||
base_model_id = model_id.split("/")[-1]
|
||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
||||
if args.result_filename:
|
||||
file_name = args.result_filename
|
||||
if args.result_dir:
|
||||
file_name = os.path.join(args.result_dir, file_name)
|
||||
with open(file_name, "w") as outfile:
|
||||
@ -516,7 +458,7 @@ def main(args: argparse.Namespace):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Benchmark the online serving throughput.")
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
@ -582,12 +524,6 @@ if __name__ == "__main__":
|
||||
default=1000,
|
||||
help="Number of prompts to process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sharegpt-output-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Output length for each request. Overrides the output length "
|
||||
"from the ShareGPT dataset.")
|
||||
parser.add_argument(
|
||||
"--sonnet-input-len",
|
||||
type=int,
|
||||
@ -649,15 +585,6 @@ if __name__ == "__main__":
|
||||
help="Specify directory to save benchmark json results."
|
||||
"If not specified, results are saved in the current directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result-filename",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Specify the filename to save benchmark json results."
|
||||
"If not specified, results will be saved in "
|
||||
"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||
" format.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
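For reference, the serving benchmark's dispatch pattern above (an async generator that paces requests, one asyncio task per request, results collected with asyncio.gather) reduces to roughly the sketch below. send_request and the exponential inter-arrival gap are illustrative stand-ins, not the actual vLLM backend request function.

import asyncio
import random
from typing import List


async def send_request(prompt: str) -> str:
    # Stand-in for the real backend request function; pretend network latency.
    await asyncio.sleep(random.uniform(0.05, 0.2))
    return f"response to {prompt!r}"


async def get_request(prompts: List[str], request_rate: float):
    # Yield prompts, sleeping between them to approximate the target QPS.
    for prompt in prompts:
        yield prompt
        if request_rate == float("inf"):
            continue  # burst mode: no pacing between requests
        await asyncio.sleep(random.expovariate(request_rate))


async def run_benchmark(prompts: List[str], request_rate: float) -> List[str]:
    tasks: List[asyncio.Task] = []
    async for prompt in get_request(prompts, request_rate):
        tasks.append(asyncio.create_task(send_request(prompt)))
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    print(asyncio.run(run_benchmark([f"p{i}" for i in range(5)], 10.0)))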
|
||||
|
||||
@ -10,10 +10,6 @@ from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizerBase)
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def sample_requests(
|
||||
dataset_path: str,
|
||||
@ -80,10 +76,8 @@ def run_vllm(
|
||||
enable_prefix_caching: bool,
|
||||
enable_chunked_prefill: bool,
|
||||
max_num_batched_tokens: int,
|
||||
distributed_executor_backend: Optional[str],
|
||||
gpu_memory_utilization: float = 0.9,
|
||||
download_dir: Optional[str] = None,
|
||||
load_format: str = EngineArgs.load_format,
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(
|
||||
@ -104,27 +98,28 @@ def run_vllm(
|
||||
download_dir=download_dir,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
load_format=load_format,
|
||||
)
|
||||
|
||||
# Add the requests to the engine.
|
||||
prompts: List[str] = []
|
||||
sampling_params: List[SamplingParams] = []
|
||||
for prompt, _, output_len in requests:
|
||||
prompts.append(prompt)
|
||||
sampling_params.append(
|
||||
SamplingParams(
|
||||
n=n,
|
||||
temperature=0.0 if use_beam_search else 1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
))
|
||||
sampling_params = SamplingParams(
|
||||
n=n,
|
||||
temperature=0.0 if use_beam_search else 1.0,
|
||||
top_p=1.0,
|
||||
use_beam_search=use_beam_search,
|
||||
ignore_eos=True,
|
||||
max_tokens=output_len,
|
||||
)
|
||||
# FIXME(woosuk): Do not use internal method.
|
||||
llm._add_request(
|
||||
prompt=prompt,
|
||||
prompt_token_ids=None,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
start = time.perf_counter()
|
||||
llm.generate(prompts, sampling_params, use_tqdm=True)
|
||||
# FIXME(woosuk): Do not use internal method.
|
||||
llm._run_engine(use_tqdm=True)
|
||||
end = time.perf_counter()
|
||||
return end - start
|
||||
|
||||
@ -231,8 +226,8 @@ def main(args: argparse.Namespace):
|
||||
args.enforce_eager, args.kv_cache_dtype,
|
||||
args.quantization_param_path, args.device,
|
||||
args.enable_prefix_caching, args.enable_chunked_prefill,
|
||||
args.max_num_batched_tokens, args.distributed_executor_backend,
|
||||
args.gpu_memory_utilization, args.download_dir, args.load_format)
|
||||
args.max_num_batched_tokens, args.gpu_memory_utilization,
|
||||
args.download_dir)
|
||||
elif args.backend == "hf":
|
||||
assert args.tensor_parallel_size == 1
|
||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||
@ -248,21 +243,9 @@ def main(args: argparse.Namespace):
|
||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
||||
|
||||
# Output JSON results if specified
|
||||
if args.output_json:
|
||||
results = {
|
||||
"elapsed_time": elapsed_time,
|
||||
"num_requests": len(requests),
|
||||
"total_num_tokens": total_num_tokens,
|
||||
"requests_per_second": len(requests) / elapsed_time,
|
||||
"tokens_per_second": total_num_tokens / elapsed_time,
|
||||
}
|
||||
with open(args.output_json, "w") as f:
|
||||
json.dump(results, f, indent=4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
|
||||
parser.add_argument("--backend",
|
||||
type=str,
|
||||
choices=["vllm", "hf", "mii"],
|
||||
@ -284,7 +267,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--tokenizer", type=str, default=None)
|
||||
parser.add_argument('--quantization',
|
||||
'-q',
|
||||
choices=[*QUANTIZATION_METHODS, None],
|
||||
choices=['awq', 'gptq', 'squeezellm', None],
|
||||
default=None)
|
||||
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
|
||||
parser.add_argument("--n",
|
||||
@ -329,13 +312,15 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="enforce eager execution")
|
||||
parser.add_argument(
|
||||
'--kv-cache-dtype',
|
||||
"--kv-cache-dtype",
|
||||
type=str,
|
||||
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
||||
choices=["auto", "fp8"],
|
||||
default="auto",
|
||||
help='Data type for kv cache storage. If "auto", will use model '
|
||||
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
||||
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
||||
help=
|
||||
'Data type for kv cache storage. If "auto", will use model data type. '
|
||||
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
|
||||
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
|
||||
'common inference criteria.')
|
||||
parser.add_argument(
|
||||
'--quantization-param-path',
|
||||
type=str,
|
||||
@ -349,10 +334,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
|
||||
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
|
||||
'CPU.')
|
||||
default="cuda",
|
||||
choices=["cuda", "cpu", "tpu"],
|
||||
help='device type for vLLM execution, supporting CUDA and CPU.')
|
||||
parser.add_argument(
|
||||
"--enable-prefix-caching",
|
||||
action='store_true',
|
||||
@ -370,41 +354,6 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
parser.add_argument(
|
||||
'--output-json',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Path to save the throughput results in JSON format.')
|
||||
parser.add_argument(
|
||||
'--distributed-executor-backend',
|
||||
choices=['ray', 'mp'],
|
||||
default=None,
|
||||
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||
'is used, will be automatically set to "ray" if installed '
|
||||
'or "mp" (multiprocessing) otherwise.')
|
||||
parser.add_argument(
|
||||
'--load-format',
|
||||
type=str,
|
||||
default=EngineArgs.load_format,
|
||||
choices=[
|
||||
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
|
||||
'bitsandbytes'
|
||||
],
|
||||
help='The format of the model weights to load.\n\n'
|
||||
'* "auto" will try to load the weights in the safetensors format '
|
||||
'and fall back to the pytorch bin format if safetensors format '
|
||||
'is not available.\n'
|
||||
'* "pt" will load the weights in the pytorch bin format.\n'
|
||||
'* "safetensors" will load the weights in the safetensors format.\n'
|
||||
'* "npcache" will load the weights in pytorch format and store '
|
||||
'a numpy cache to speed up the loading.\n'
|
||||
'* "dummy" will initialize the weights with random values, '
|
||||
'which is mainly for profiling.\n'
|
||||
'* "tensorizer" will load the weights using tensorizer from '
|
||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
|
||||
'section for more information.\n'
|
||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||
'quantization.\n')
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
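The throughput summary and the --output-json path in the hunks above amount to a few divisions over the elapsed wall-clock time; a self-contained sketch with made-up numbers:

import json

elapsed_time = 12.5            # seconds, e.g. end - start from time.perf_counter()
num_requests = 1000
total_num_tokens = 450_000     # prompt tokens + generated tokens

results = {
    "elapsed_time": elapsed_time,
    "num_requests": num_requests,
    "total_num_tokens": total_num_tokens,
    "requests_per_second": num_requests / elapsed_time,
    "tokens_per_second": total_num_tokens / elapsed_time,
}
print(f"Throughput: {results['requests_per_second']:.2f} requests/s, "
      f"{results['tokens_per_second']:.2f} tokens/s")

# Mirrors the --output-json option; the filename here is arbitrary.
with open("throughput_results.json", "w") as f:
    json.dump(results, f, indent=4)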
|
||||
|
||||
@@ -1,353 +0,0 @@
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
import pickle as pkl
|
||||
import time
|
||||
from typing import Callable, Iterable, List, Tuple
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
from torch.utils.benchmark import Measurement as TMeasurement
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||
DEFAULT_TP_SIZES = [1]
|
||||
|
||||
# helpers
|
||||
|
||||
|
||||
def to_fp8(tensor: torch.tensor) -> torch.tensor:
|
||||
finfo = torch.finfo(torch.float8_e4m3fn)
|
||||
return torch.round(tensor.clamp(
|
||||
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
|
||||
|
||||
|
||||
def to_int8(tensor: torch.tensor) -> torch.tensor:
|
||||
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
|
||||
|
||||
|
||||
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
|
||||
k: int) -> Tuple[torch.tensor, torch.tensor]:
|
||||
|
||||
a = torch.randn((m, k), device='cuda') * 5
|
||||
b = torch.randn((n, k), device='cuda').t() * 5
|
||||
|
||||
if dtype == torch.int8:
|
||||
return to_int8(a), to_int8(b)
|
||||
if dtype == torch.float8_e4m3fn:
|
||||
return to_fp8(a), to_fp8(b)
|
||||
|
||||
raise ValueError("unsupported dtype")
|
||||
|
||||
|
||||
# impl
|
||||
|
||||
|
||||
def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||
scale_b: torch.tensor,
|
||||
out_dtype: torch.dtype) -> torch.tensor:
|
||||
return torch.mm(a, b)
|
||||
|
||||
|
||||
def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||
scale_b: torch.tensor,
|
||||
out_dtype: torch.dtype) -> torch.tensor:
|
||||
return torch._scaled_mm(a,
|
||||
b,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=out_dtype)
|
||||
|
||||
|
||||
def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
|
||||
scale_a: torch.tensor, scale_b: torch.tensor,
|
||||
out_dtype: torch.dtype) -> torch.tensor:
|
||||
return torch._scaled_mm(a,
|
||||
b,
|
||||
scale_a=scale_a,
|
||||
scale_b=scale_b,
|
||||
out_dtype=out_dtype,
|
||||
use_fast_accum=True)
|
||||
|
||||
|
||||
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||
scale_b: torch.tensor,
|
||||
out_dtype: torch.dtype) -> torch.tensor:
|
||||
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
|
||||
|
||||
|
||||
# bench
|
||||
def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
|
||||
scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
|
||||
sub_label: str, fn: Callable, description: str) -> TMeasurement:
|
||||
|
||||
min_run_time = 1
|
||||
|
||||
globals = {
|
||||
"a": a,
|
||||
"b": b,
|
||||
"scale_a": scale_a,
|
||||
"scale_b": scale_b,
|
||||
"out_dtype": out_dtype,
|
||||
"fn": fn,
|
||||
}
|
||||
return TBenchmark.Timer(
|
||||
stmt="fn(a, b, scale_a, scale_b, out_dtype)",
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description=description,
|
||||
).blocked_autorange(min_run_time=min_run_time)
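bench_fn above wraps torch.utils.benchmark; stripped of the vLLM-specific kernels, the same timing harness looks like the sketch below (a plain bf16 matmul stands in for the scaled GEMMs, and a CUDA device is assumed):

import torch
import torch.utils.benchmark as TBenchmark

a = torch.randn((256, 4096), device="cuda", dtype=torch.bfloat16)
b = torch.randn((4096, 4096), device="cuda", dtype=torch.bfloat16)

timer = TBenchmark.Timer(
    stmt="torch.mm(a, b)",           # statement to time
    globals={"a": a, "b": b},        # names the statement refers to
    label="scaled-gemm-example",
    sub_label="MKN=(256x4096x4096)",
    description="pytorch_bf16_matmul",
)
# blocked_autorange re-runs the statement until at least min_run_time seconds
# of measurements have been collected, then returns a Measurement.
print(timer.blocked_autorange(min_run_time=1))

Measurements collected this way can be fed to torch.utils.benchmark.Compare, which is what print_timers does further down.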
|
||||
|
||||
|
||||
def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
||||
sub_label: str) -> Iterable[TMeasurement]:
|
||||
assert dtype == torch.int8
|
||||
a, b = make_rand_tensors(torch.int8, m, n, k)
|
||||
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
|
||||
timers = []
|
||||
# pytorch impl
|
||||
timers.append(
|
||||
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
|
||||
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
|
||||
torch.bfloat16, label, sub_label, pytorch_mm_impl,
|
||||
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
|
||||
|
||||
# cutlass impl
|
||||
timers.append(
|
||||
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||
cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
|
||||
|
||||
return timers
|
||||
|
||||
|
||||
def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
||||
sub_label: str) -> Iterable[TMeasurement]:
|
||||
assert dtype == torch.float8_e4m3fn
|
||||
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
|
||||
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
|
||||
timers = []
|
||||
|
||||
# pytorch impl w. bf16
|
||||
timers.append(
|
||||
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
|
||||
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
|
||||
torch.bfloat16, label, sub_label, pytorch_mm_impl,
|
||||
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
|
||||
|
||||
# pytorch impl: bf16 output, without fp8 fast accum
|
||||
timers.append(
|
||||
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||
pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
|
||||
|
||||
# pytorch impl: bf16 output, with fp8 fast accum
|
||||
timers.append(
|
||||
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||
pytorch_fp8_impl_fast_accum,
|
||||
"pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
|
||||
|
||||
# pytorch impl: fp16 output, without fp8 fast accum
|
||||
timers.append(
|
||||
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
|
||||
pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
|
||||
|
||||
# pytorch impl: fp16 output, with fp8 fast accum
|
||||
timers.append(
|
||||
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
|
||||
pytorch_fp8_impl_fast_accum,
|
||||
"pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
|
||||
|
||||
# cutlass impl: bf16 output
|
||||
timers.append(
|
||||
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
|
||||
cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
|
||||
# cutlass impl: fp16 output
|
||||
timers.append(
|
||||
bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
|
||||
cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
|
||||
return timers
|
||||
|
||||
|
||||
def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
|
||||
sub_label: str) -> Iterable[TMeasurement]:
|
||||
if dtype == torch.int8:
|
||||
return bench_int8(dtype, m, k, n, label, sub_label)
|
||||
if dtype == torch.float8_e4m3fn:
|
||||
return bench_fp8(dtype, m, k, n, label, sub_label)
|
||||
raise ValueError("unsupported type")
|
||||
|
||||
|
||||
# runner
|
||||
def print_timers(timers: Iterable[TMeasurement]):
|
||||
compare = TBenchmark.Compare(timers)
|
||||
compare.print()
|
||||
|
||||
|
||||
def run(dtype: torch.dtype,
|
||||
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
|
||||
|
||||
results = []
|
||||
for m, k, n in MKNs:
|
||||
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
|
||||
f"MKN=({m}x{k}x{n})")
|
||||
print_timers(timers)
|
||||
results.extend(timers)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# output makers
|
||||
def make_output(data: Iterable[TMeasurement],
|
||||
MKNs: Iterable[Tuple[int, int, int]],
|
||||
base_description: str,
|
||||
timestamp=None):
|
||||
|
||||
print(f"== All Results {base_description} ====")
|
||||
print_timers(data)
|
||||
|
||||
# pickle all the results
|
||||
timestamp = int(time.time()) if timestamp is None else timestamp
|
||||
with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
|
||||
pkl.dump(data, f)
|
||||
|
||||
|
||||
# argparse runners
|
||||
|
||||
|
||||
def run_square_bench(args):
|
||||
dim_sizes = list(
|
||||
range(args.dim_start, args.dim_end + 1, args.dim_increment))
|
||||
MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
|
||||
data = run(args.dtype, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"square_bench-{args.dtype}")
|
||||
|
||||
|
||||
def run_range_bench(args):
|
||||
dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
|
||||
n = len(dim_sizes)
|
||||
Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
|
||||
Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
|
||||
Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
|
||||
MKNs = list(zip(Ms, Ks, Ns))
|
||||
data = run(args.dtype, MKNs)
|
||||
|
||||
make_output(data, MKNs, f"range_bench-{args.dtype}")
|
||||
|
||||
|
||||
def run_model_bench(args):
|
||||
|
||||
print("Benchmarking models:")
|
||||
for i, model in enumerate(args.models):
|
||||
print(f"[{i}] {model}")
|
||||
|
||||
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
|
||||
KNs = []
|
||||
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
|
||||
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
|
||||
KNs.append(KN)
|
||||
return KNs
|
||||
|
||||
model_bench_data = []
|
||||
models_tps = list(itertools.product(args.models, args.tp_sizes))
|
||||
for model, tp_size in models_tps:
|
||||
Ms = args.batch_sizes
|
||||
KNs = model_shapes(model, tp_size)
|
||||
MKNs = []
|
||||
for m in Ms:
|
||||
for k, n in KNs:
|
||||
MKNs.append((m, k, n))
|
||||
|
||||
data = run(args.dtype, MKNs)
|
||||
model_bench_data.append(data)
|
||||
|
||||
# Print all results
|
||||
for data, model_tp in zip(model_bench_data, models_tps):
|
||||
model, tp_size = model_tp
|
||||
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
|
||||
print_timers(data)
|
||||
|
||||
timestamp = int(time.time())
|
||||
|
||||
all_data = []
|
||||
for d in model_bench_data:
|
||||
all_data.extend(d)
|
||||
# pickle all data
|
||||
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
|
||||
pkl.dump(all_data, f)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def to_torch_dtype(dt):
|
||||
if dt == "int8":
|
||||
return torch.int8
|
||||
if dt == "fp8":
|
||||
return torch.float8_e4m3fn
|
||||
raise ValueError("unsupported dtype")
|
||||
|
||||
parser = FlexibleArgumentParser(
|
||||
description="""
|
||||
Benchmark Cutlass GEMM.
|
||||
|
||||
To run square GEMMs:
|
||||
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
|
||||
|
||||
To run constant N and K and sweep M:
|
||||
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
|
||||
|
||||
To run dimensions from a model:
|
||||
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
|
||||
|
||||
Output:
|
||||
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
|
||||
""", # noqa: E501
|
||||
formatter_class=argparse.RawTextHelpFormatter)
|
||||
|
||||
parser.add_argument("--dtype",
|
||||
type=to_torch_dtype,
|
||||
required=True,
|
||||
help="Available options are ['int8', 'fp8']")
|
||||
subparsers = parser.add_subparsers(dest="cmd")
|
||||
|
||||
square_parser = subparsers.add_parser("square_bench")
|
||||
square_parser.add_argument("--dim-start", type=int, required=True)
|
||||
square_parser.add_argument("--dim-end", type=int, required=True)
|
||||
square_parser.add_argument("--dim-increment", type=int, required=True)
|
||||
square_parser.set_defaults(func=run_square_bench)
|
||||
|
||||
range_parser = subparsers.add_parser("range_bench")
|
||||
range_parser.add_argument("--dim-start", type=int, required=True)
|
||||
range_parser.add_argument("--dim-end", type=int, required=True)
|
||||
range_parser.add_argument("--dim-increment", type=int, required=True)
|
||||
range_parser.add_argument("--m-constant", type=int, default=None)
|
||||
range_parser.add_argument("--n-constant", type=int, default=None)
|
||||
range_parser.add_argument("--k-constant", type=int, default=None)
|
||||
range_parser.set_defaults(func=run_range_bench)
|
||||
|
||||
model_parser = subparsers.add_parser("model_bench")
|
||||
model_parser.add_argument("--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=DEFAULT_MODELS,
|
||||
choices=WEIGHT_SHAPES.keys())
|
||||
model_parser.add_argument("--tp-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=DEFAULT_TP_SIZES)
|
||||
model_parser.add_argument("--batch-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=DEFAULT_BATCH_SIZES)
|
||||
model_parser.set_defaults(func=run_model_bench)
|
||||
|
||||
args = parser.parse_args()
|
||||
args.func(args)
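The script dispatches its subcommands through argparse's set_defaults(func=...) idiom used just above; a minimal standalone sketch of that pattern:

import argparse


def run_square_bench(args: argparse.Namespace) -> None:
    print(f"square bench from {args.dim_start} to {args.dim_end}")


parser = argparse.ArgumentParser(description="subcommand dispatch example")
subparsers = parser.add_subparsers(dest="cmd", required=True)

square_parser = subparsers.add_parser("square_bench")
square_parser.add_argument("--dim-start", type=int, required=True)
square_parser.add_argument("--dim-end", type=int, required=True)
# Bind the handler to the subcommand so args.func(args) picks it up.
square_parser.set_defaults(func=run_square_bench)

args = parser.parse_args(["square_bench", "--dim-start", "128", "--dim-end", "512"])
args.func(args)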
|
||||
@@ -1,43 +0,0 @@
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536

# TP1 shapes
WEIGHT_SHAPES = {
    "mistralai/Mistral-7B-v0.1": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-7b-hf": [
        ([4096, 12288], 1),
        ([4096, 4096], 0),
        ([4096, 22016], 1),
        ([11008, 4096], 0),
    ],
    "meta-llama/Llama-3-8b": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-13b-hf": [
        ([5120, 15360], 1),
        ([5120, 5120], 0),
        ([5120, 27648], 1),
        ([13824, 5120], 0),
    ],
    "meta-llama/Llama-2-70b-hf": [
        ([8192, 10240], 1),
        ([8192, 8192], 0),
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
}
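To make the ([K, N], TP_SPLIT_DIM) convention above concrete, here is a small illustrative helper (not part of the benchmark scripts) that expands an entry into the per-tensor-parallel GEMM shape, matching what run_model_bench's model_shapes does with WEIGHT_SHAPES:

from typing import List, Tuple


def tp_shard_shape(kn: List[int], tp_split_dim: int, tp_size: int) -> Tuple[int, int]:
    shape = list(kn)
    shape[tp_split_dim] //= tp_size  # only the split dimension shrinks
    return shape[0], shape[1]


# ([14336, 4096], 0): K is sharded across TP ranks.
assert tp_shard_shape([14336, 4096], 0, 1) == (14336, 4096)
assert tp_shard_shape([14336, 4096], 0, 2) == (7168, 4096)
# ([4096, 6144], 1): N is sharded across TP ranks.
assert tp_shard_shape([4096, 6144], 1, 4) == (4096, 1536)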
@@ -1,302 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.aqlm import (
|
||||
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
|
||||
optimized_dequantize_gemm)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||
|
||||
|
||||
def torch_mult(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
weights: torch.Tensor,
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
) -> torch.Tensor:
|
||||
output = F.linear(input, weights)
|
||||
return output
|
||||
|
||||
|
||||
def dequant_out_scale(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||
codebooks: torch.
|
||||
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
output_partition_sizes: torch.IntTensor,
|
||||
bias: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
|
||||
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||
|
||||
if bias is None:
|
||||
output = F.linear(input, weights, bias)
|
||||
orig_shape = output.shape
|
||||
flattened_output = output.view(-1, output.size(-1))
|
||||
f_scales = scales.view(-1, scales.shape[0])
|
||||
b_scales = f_scales.expand(flattened_output.shape[0], -1)
|
||||
flattened_output *= b_scales
|
||||
return flattened_output.view(orig_shape)
|
||||
else:
|
||||
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
|
||||
-1, weights.shape[1])
|
||||
weights *= b_scales
|
||||
return F.linear(input, weights, bias)
|
||||
|
||||
|
||||
def dequant_weight_scale(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||
codebooks: torch.
|
||||
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
output_partition_sizes: torch.IntTensor,
|
||||
bias: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
|
||||
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||
|
||||
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
|
||||
-1, weights.shape[1])
|
||||
weights *= b_scales
|
||||
return F.linear(input, weights, bias)
|
||||
|
||||
|
||||
def dequant_no_scale(
|
||||
input: torch.Tensor, # [..., in_features]
|
||||
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||
codebooks: torch.
|
||||
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||
output_partition_sizes: torch.IntTensor,
|
||||
bias: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
|
||||
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||
|
||||
return F.linear(input, weights, bias)
|
||||
|
||||
|
||||
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
|
||||
# the generic pytorch version.
|
||||
# Just visual comparison.
|
||||
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
|
||||
|
||||
n = int(parts.sum().item())
|
||||
|
||||
device = torch.device('cuda:0')
|
||||
|
||||
code_range = (1 << bits) // 2
|
||||
ingroups = 8
|
||||
|
||||
codes = torch.randint(-code_range,
|
||||
code_range,
|
||||
size=(n, k // ingroups, nbooks),
|
||||
dtype=get_int_dtype(bits),
|
||||
device=device)
|
||||
|
||||
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
|
||||
dtype=torch.float16,
|
||||
device=device)
|
||||
|
||||
count = 0
|
||||
for index in range(16):
|
||||
for i in range(8):
|
||||
for book in range(nbooks):
|
||||
codebooks[book, index, 0, i] = count * (10**book)
|
||||
count += 1
|
||||
|
||||
print("codes shape", codes.shape)
|
||||
|
||||
for i in range(16):
|
||||
for book in range(nbooks):
|
||||
codes[0, i, book] = i
|
||||
codes[0, -i, book] = i
|
||||
|
||||
weights = dequantize_weight(codes, codebooks, None)
|
||||
weights2 = ops.aqlm_dequant(codes, codebooks, parts)
|
||||
|
||||
print("weights shape:", weights.shape)
|
||||
print("weights2 shape:", weights2.shape)
|
||||
|
||||
print("weights are:", weights)
|
||||
print("weights2 are:", weights2)
|
||||
|
||||
print("first 128 weights are", weights[0, 0:128].to(torch.int32))
|
||||
print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
|
||||
|
||||
print("last 128 weights are", weights[0, -128:])
|
||||
print("last 128 weights2 are:", weights2[0, -128:])
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
|
||||
|
||||
# Add arguments
|
||||
parser.add_argument("--nbooks",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of codebooks (default: 1)")
|
||||
parser.add_argument("--bits",
|
||||
type=int,
|
||||
default=16,
|
||||
help="Number of bits per code element (default: 16)")
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Run the decompression/dequant tester rather than benchmarking "
|
||||
"(default: False)")
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
||||
# Extract values
|
||||
nbooks = args.nbooks
|
||||
bits = args.bits
|
||||
|
||||
if args.test:
|
||||
dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
|
||||
return
|
||||
|
||||
# Otherwise, benchmark.
|
||||
methods = [
|
||||
ops.aqlm_gemm,
|
||||
dequant_out_scale,
|
||||
generic_dequantize_gemm,
|
||||
optimized_dequantize_gemm,
|
||||
dequant_weight_scale,
|
||||
torch_mult,
|
||||
dequant_no_scale,
|
||||
]
|
||||
|
||||
filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
|
||||
print(f"writing benchmarks to file {filename}")
|
||||
with open(filename, "w") as f:
|
||||
sys.stdout = f
|
||||
|
||||
print('m | k | n | n parts', end='')
|
||||
for method in methods:
|
||||
print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
|
||||
print('')
|
||||
|
||||
# These are reasonable prefill sizes.
|
||||
ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
|
||||
(4096, (11008, 11008)), (11008, (4096, )))
|
||||
|
||||
# reasonable ranges for m.
|
||||
for m in [
|
||||
1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
|
||||
128, 256, 512, 1024, 1536, 2048, 3072, 4096
|
||||
]:
|
||||
print(f'{m}', file=sys.__stdout__)
|
||||
for ksp in ksandpartions:
|
||||
run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
|
||||
methods)
|
||||
|
||||
sys.stdout = sys.__stdout__
|
||||
|
||||
|
||||
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
|
||||
methods):
|
||||
|
||||
# I didn't see visible improvements from increasing these, but feel free :)
|
||||
num_warmup_trials = 1
|
||||
num_trials = 1
|
||||
|
||||
num_calls = 100
|
||||
|
||||
# warmup.
|
||||
for method in methods:
|
||||
for _ in range(num_warmup_trials):
|
||||
run_timing(
|
||||
num_calls=num_calls,
|
||||
m=m,
|
||||
k=k,
|
||||
parts=parts,
|
||||
nbooks=nbooks,
|
||||
bits=bits,
|
||||
method=method,
|
||||
)
|
||||
|
||||
n = parts.sum().item()
|
||||
print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
|
||||
|
||||
for method in methods:
|
||||
best_time_us = 1e20
|
||||
for _ in range(num_trials):
|
||||
kernel_dur_ms = run_timing(
|
||||
num_calls=num_calls,
|
||||
m=m,
|
||||
k=k,
|
||||
parts=parts,
|
||||
nbooks=nbooks,
|
||||
bits=bits,
|
||||
method=method,
|
||||
)
|
||||
|
||||
kernel_dur_us = 1000 * kernel_dur_ms
|
||||
|
||||
if kernel_dur_us < best_time_us:
|
||||
best_time_us = kernel_dur_us
|
||||
|
||||
print(f' | {kernel_dur_us:.0f}', end='')
|
||||
|
||||
print('')
|
||||
|
||||
|
||||
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
|
||||
nbooks: int, bits: int, method) -> float:
|
||||
|
||||
n = int(parts.sum().item())
|
||||
|
||||
device = torch.device('cuda:0')
|
||||
|
||||
input = torch.randn((1, m, k), dtype=torch.float16, device=device)
|
||||
|
||||
code_range = (1 << bits) // 2
|
||||
ingroups = 8
|
||||
|
||||
codes = torch.randint(-code_range,
|
||||
code_range,
|
||||
size=(n, k // ingroups, nbooks),
|
||||
dtype=get_int_dtype(bits),
|
||||
device=device)
|
||||
|
||||
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
|
||||
dtype=torch.float16,
|
||||
device=device)
|
||||
|
||||
scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
|
||||
|
||||
# for comparison to just a pytorch mult.
|
||||
weights = torch.randn((n, k), dtype=torch.float16, device=device)
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start_event.record()
|
||||
|
||||
if method is torch_mult:
|
||||
for i in range(num_calls):
|
||||
torch_mult(input, weights, scales)
|
||||
else:
|
||||
for i in range(num_calls):
|
||||
method(input, codes, codebooks, scales, parts, None)
|
||||
|
||||
end_event.record()
|
||||
end_event.synchronize()
|
||||
|
||||
dur_ms = start_event.elapsed_time(end_event) / num_calls
|
||||
return dur_ms
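run_timing above uses CUDA events so the measurement covers GPU time only; the core pattern looks like the sketch below, with a plain matmul standing in for the AQLM kernels and a CUDA device assumed:

import torch


def time_cuda(fn, num_calls: int = 100) -> float:
    # Average milliseconds per call, measured with CUDA events.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_calls):
        fn()
    end_event.record()
    end_event.synchronize()  # wait for the GPU before reading the timer
    return start_event.elapsed_time(end_event) / num_calls


if __name__ == "__main__":
    a = torch.randn(1024, 4096, dtype=torch.float16, device="cuda")
    w = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
    print(f"{time_cuda(lambda: a @ w):.3f} ms per call")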
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -1,235 +0,0 @@
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
from benchmark_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin import (
|
||||
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
gptq_pack, quantize_weights, sort_weights)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||
|
||||
ACT_ORDER_OPTS = [False, True]
|
||||
K_FULL_OPTS = [False, True]
|
||||
|
||||
|
||||
def bench_run(results: List[benchmark.Measurement], model: str,
|
||||
act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
|
||||
size_m: int, size_k: int, size_n: int):
|
||||
label = "Quant Matmul"
|
||||
|
||||
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
|
||||
"MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
|
||||
group_size, size_m, size_k, size_n))
|
||||
|
||||
print(f"Testing: {sub_label}")
|
||||
|
||||
a = torch.randn(size_m, size_k).to(torch.half).cuda()
|
||||
b = torch.rand(size_k, size_n).to(torch.half).cuda()
|
||||
|
||||
a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
|
||||
|
||||
# Marlin quant
|
||||
(
|
||||
marlin_w_ref,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_g_idx,
|
||||
marlin_sort_indices,
|
||||
marlin_rand_perm,
|
||||
) = marlin_quantize(b, num_bits, group_size, act_order)
|
||||
|
||||
# Marlin_24 quant
|
||||
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
|
||||
marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
|
||||
|
||||
# GPTQ quant
|
||||
(w_ref, q_w, s, g_idx,
|
||||
rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
|
||||
q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
|
||||
|
||||
# For act_order, sort the "weights" and "g_idx"
|
||||
# so that group ids are increasing
|
||||
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
|
||||
if act_order:
|
||||
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
|
||||
|
||||
# Prepare
|
||||
marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_MAX_PARALLEL)
|
||||
|
||||
marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||
GPTQ_MARLIN_24_MAX_PARALLEL)
|
||||
|
||||
globals = {
|
||||
# Gen params
|
||||
"num_bits": num_bits,
|
||||
"group_size": group_size,
|
||||
"size_m": size_m,
|
||||
"size_n": size_n,
|
||||
"size_k": size_k,
|
||||
"a": a,
|
||||
"a_tmp": a_tmp,
|
||||
# Marlin params
|
||||
"marlin_w_ref": marlin_w_ref,
|
||||
"marlin_q_w": marlin_q_w,
|
||||
"marlin_s": marlin_s,
|
||||
"marlin_g_idx": marlin_g_idx,
|
||||
"marlin_sort_indices": marlin_sort_indices,
|
||||
"marlin_rand_perm": marlin_rand_perm,
|
||||
"marlin_workspace": marlin_workspace,
|
||||
"is_k_full": is_k_full,
|
||||
# Marlin_24 params
|
||||
"marlin_24_w_ref": marlin_24_w_ref,
|
||||
"marlin_24_q_w_comp": marlin_24_q_w_comp,
|
||||
"marlin_24_meta": marlin_24_meta,
|
||||
"marlin_24_s": marlin_24_s,
|
||||
"marlin_24_workspace": marlin_24_workspace,
|
||||
# GPTQ params
|
||||
"q_w_gptq": q_w_gptq,
|
||||
"repack_sort_indices": repack_sort_indices,
|
||||
# Kernels
|
||||
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||
"gptq_marlin_repack": ops.gptq_marlin_repack,
|
||||
}
|
||||
|
||||
min_run_time = 1
|
||||
|
||||
# Warmup pytorch
|
||||
for i in range(5):
|
||||
torch.matmul(a, marlin_w_ref)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="torch.matmul(a, marlin_w_ref)",
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="pytorch_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt=
|
||||
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
|
||||
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt=
|
||||
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_24_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt=
|
||||
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_repack",
|
||||
).blocked_autorange(min_run_time=min_run_time))
|
||||
|
||||
|
||||
def main(args):
|
||||
print("Benchmarking models:")
|
||||
for i, model in enumerate(args.models):
|
||||
print(f"[{i}] {model}")
|
||||
|
||||
results: List[benchmark.Measurement] = []
|
||||
|
||||
for model in args.models:
|
||||
for layer in WEIGHT_SHAPES[model]:
|
||||
size_k = layer[0]
|
||||
size_n = layer[1]
|
||||
|
||||
if len(args.limit_k) > 0 and size_k not in args.limit_k:
|
||||
continue
|
||||
|
||||
if len(args.limit_n) > 0 and size_n not in args.limit_n:
|
||||
continue
|
||||
|
||||
for act_order in ACT_ORDER_OPTS:
|
||||
if len(args.limit_act_order
|
||||
) > 0 and act_order not in args.limit_act_order:
|
||||
continue
|
||||
|
||||
for is_k_full in K_FULL_OPTS:
|
||||
if len(args.limit_k_full
|
||||
) > 0 and is_k_full not in args.limit_k_full:
|
||||
continue
|
||||
|
||||
for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
|
||||
if len(args.limit_num_bits
|
||||
) > 0 and num_bits not in args.limit_num_bits:
|
||||
continue
|
||||
|
||||
for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
|
||||
if len(
|
||||
args.limit_group_size
|
||||
) > 0 and group_size not in args.limit_group_size:
|
||||
continue
|
||||
|
||||
# For act_order, the group_size must be less than
|
||||
# size_k
|
||||
if act_order and (group_size == size_k
|
||||
or group_size == -1):
|
||||
continue
|
||||
|
||||
for size_m in args.batch_sizes:
|
||||
bench_run(results, model, act_order, is_k_full,
|
||||
num_bits, group_size, size_m, size_k,
|
||||
size_n)
|
||||
|
||||
compare = benchmark.Compare(results)
|
||||
compare.print()
|
||||
|
||||
|
||||
# For quick benchmarking use:
|
||||
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
|
||||
#
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark Marlin across specified models/shapes/batches")
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=DEFAULT_MODELS,
|
||||
choices=WEIGHT_SHAPES.keys(),
|
||||
)
|
||||
parser.add_argument("--batch-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=DEFAULT_BATCH_SIZES)
|
||||
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
|
||||
parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
benchmarks/kernels/benchmark_mixtral_moe.py (new file, 182 lines)
@@ -0,0 +1,182 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import triton
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import (fused_moe,
|
||||
get_config_file_name)
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||
|
||||
|
||||
def main():
|
||||
method = fused_moe
|
||||
for bs in [
|
||||
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
|
||||
2048, 3072, 4096
|
||||
]:
|
||||
run_grid(bs, method=method)
|
||||
|
||||
|
||||
def run_grid(bs, method):
|
||||
d_model = 4096
|
||||
num_total_experts = 8
|
||||
top_k = 2
|
||||
tp_size = 2
|
||||
model_intermediate_size = 14336
|
||||
num_layers = 32
|
||||
num_calls = 100
|
||||
|
||||
num_warmup_trials = 1
|
||||
num_trials = 1
|
||||
|
||||
configs = []
|
||||
if bs <= 16:
|
||||
BLOCK_SIZES_M = [16]
|
||||
elif bs <= 32:
|
||||
BLOCK_SIZES_M = [16, 32]
|
||||
elif bs <= 64:
|
||||
BLOCK_SIZES_M = [16, 32, 64]
|
||||
elif bs <= 128:
|
||||
BLOCK_SIZES_M = [16, 32, 64, 128]
|
||||
else:
|
||||
BLOCK_SIZES_M = [16, 32, 64, 128, 256]
|
||||
|
||||
for block_size_n in [32, 64, 128, 256]:
|
||||
for block_size_m in BLOCK_SIZES_M:
|
||||
for block_size_k in [64, 128, 256]:
|
||||
for group_size_m in [1, 16, 32, 64]:
|
||||
for num_warps in [4, 8]:
|
||||
configs.append({
|
||||
"BLOCK_SIZE_M": block_size_m,
|
||||
"BLOCK_SIZE_N": block_size_n,
|
||||
"BLOCK_SIZE_K": block_size_k,
|
||||
"GROUP_SIZE_M": group_size_m,
|
||||
"num_warps": num_warps,
|
||||
"num_stages": 4,
|
||||
})
|
||||
|
||||
best_config = None
|
||||
best_time_us = 1e20
|
||||
|
||||
for config in configs:
|
||||
print(f'{tp_size=} {bs=}')
|
||||
print(f'{config}')
|
||||
# warmup
|
||||
print('warming up')
|
||||
try:
|
||||
for _ in range(num_warmup_trials):
|
||||
run_timing(
|
||||
num_calls=num_calls,
|
||||
bs=bs,
|
||||
d_model=d_model,
|
||||
num_total_experts=num_total_experts,
|
||||
top_k=top_k,
|
||||
tp_size=tp_size,
|
||||
model_intermediate_size=model_intermediate_size,
|
||||
method=method,
|
||||
config=config,
|
||||
)
|
||||
except triton.runtime.autotuner.OutOfResources:
|
||||
continue
|
||||
|
||||
# trial
|
||||
print('benchmarking')
|
||||
for _ in range(num_trials):
|
||||
kernel_dur_ms = run_timing(
|
||||
num_calls=num_calls,
|
||||
bs=bs,
|
||||
d_model=d_model,
|
||||
num_total_experts=num_total_experts,
|
||||
top_k=top_k,
|
||||
tp_size=tp_size,
|
||||
model_intermediate_size=model_intermediate_size,
|
||||
method=method,
|
||||
config=config,
|
||||
)
|
||||
|
||||
kernel_dur_us = 1000 * kernel_dur_ms
|
||||
model_dur_ms = kernel_dur_ms * num_layers
|
||||
|
||||
if kernel_dur_us < best_time_us:
|
||||
best_config = config
|
||||
best_time_us = kernel_dur_us
|
||||
|
||||
print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
|
||||
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
|
||||
f'{d_model=} {model_intermediate_size=} {num_layers=}')
|
||||
|
||||
print("best_time_us", best_time_us)
|
||||
print("best_config", best_config)
|
||||
|
||||
# holds Dict[str, Dict[str, int]]
|
||||
filename = get_config_file_name(num_total_experts,
|
||||
model_intermediate_size // tp_size)
|
||||
print(f"writing config to file {filename}")
|
||||
existing_content = {}
|
||||
if os.path.exists(filename):
|
||||
with open(filename, "r") as f:
|
||||
existing_content = json.load(f)
|
||||
existing_content[str(bs)] = best_config
|
||||
with open(filename, "w") as f:
|
||||
json.dump(existing_content, f, indent=4)
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
|
||||
top_k: int, tp_size: int, model_intermediate_size: int, method,
|
||||
config) -> float:
|
||||
shard_intermediate_size = model_intermediate_size // tp_size
|
||||
|
||||
hidden_states = torch.rand(
|
||||
(bs, d_model),
|
||||
device="cuda:0",
|
||||
dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
ws = torch.rand(
|
||||
(num_total_experts, 2 * shard_intermediate_size, d_model),
|
||||
device=hidden_states.device,
|
||||
dtype=hidden_states.dtype,
|
||||
)
|
||||
|
||||
w2s = torch.rand(
|
||||
(num_total_experts, d_model, shard_intermediate_size),
|
||||
device=hidden_states.device,
|
||||
dtype=hidden_states.dtype,
|
||||
)
|
||||
|
||||
gating_output = F.softmax(torch.rand(
|
||||
(num_calls, bs, num_total_experts),
|
||||
device=hidden_states.device,
|
||||
dtype=torch.float32,
|
||||
),
|
||||
dim=-1)
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
start_event.record()
|
||||
for i in range(num_calls):
|
||||
hidden_states = method(
|
||||
hidden_states=hidden_states,
|
||||
w1=ws,
|
||||
w2=w2s,
|
||||
gating_output=gating_output[i],
|
||||
topk=2,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
override_config=config,
|
||||
)
|
||||
end_event.record()
|
||||
end_event.synchronize()
|
||||
|
||||
dur_ms = start_event.elapsed_time(end_event) / num_calls
|
||||
return dur_ms
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -1,333 +0,0 @@
|
||||
import argparse
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Tuple, TypedDict
|
||||
|
||||
import ray
|
||||
import torch
|
||||
import triton
|
||||
from ray.experimental.tqdm_ray import tqdm
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class BenchmarkConfig(TypedDict):
|
||||
BLOCK_SIZE_M: int
|
||||
BLOCK_SIZE_N: int
|
||||
BLOCK_SIZE_K: int
|
||||
GROUP_SIZE_M: int
|
||||
num_warps: int
|
||||
num_stages: int
|
||||
|
||||
|
||||
def benchmark_config(
|
||||
config: BenchmarkConfig,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8: bool,
|
||||
num_iters: int = 100,
|
||||
) -> float:
|
||||
init_dtype = torch.float16 if use_fp8 else dtype
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
w1 = torch.randn(num_experts,
|
||||
shard_intermediate_size,
|
||||
hidden_size,
|
||||
dtype=init_dtype)
|
||||
w2 = torch.randn(num_experts,
|
||||
hidden_size,
|
||||
shard_intermediate_size // 2,
|
||||
dtype=init_dtype)
|
||||
gating_output = torch.randn(num_iters,
|
||||
num_tokens,
|
||||
num_experts,
|
||||
dtype=torch.float32)
|
||||
|
||||
w1_scale = None
|
||||
w2_scale = None
|
||||
a1_scale = None
|
||||
a2_scale = None
|
||||
if use_fp8:
|
||||
w1_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||
w2_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||
a1_scale = torch.randn(1, dtype=torch.float32)
|
||||
a2_scale = torch.randn(1, dtype=torch.float32)
|
||||
|
||||
w1 = w1.to(torch.float8_e4m3fn)
|
||||
w2 = w2.to(torch.float8_e4m3fn)
|
||||
|
||||
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
|
||||
|
||||
def prepare(i: int):
|
||||
input_gating.copy_(gating_output[i])
|
||||
|
||||
def run():
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
override_config=config,
|
||||
use_fp8=use_fp8,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Capture 10 invocations with CUDA graph
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph):
|
||||
for _ in range(10):
|
||||
run()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Warmup
|
||||
for _ in range(5):
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event = torch.cuda.Event(enable_timing=True)
|
||||
end_event = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
latencies: List[float] = []
|
||||
for i in range(num_iters):
|
||||
prepare(i)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
start_event.record()
|
||||
graph.replay()
|
||||
end_event.record()
|
||||
end_event.synchronize()
|
||||
latencies.append(start_event.elapsed_time(end_event))
|
||||
avg = sum(latencies) / (num_iters * 10) * 1000 # us
|
||||
graph.reset()
|
||||
return avg
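benchmark_config above captures ten kernel invocations in a CUDA graph and then times graph replays; the capture/replay mechanics are sketched below, with a matmul as a placeholder for fused_moe (illustrative shapes, CUDA device assumed):

import torch

a = torch.randn(512, 512, device="cuda")
b = torch.randn(512, 512, device="cuda")
out = torch.empty(512, 512, device="cuda")


def run():
    torch.mm(a, b, out=out)  # writes into a fixed buffer, as graph capture expects


# Warm up (JIT / lazy init) before capturing, as the script above does.
run()
torch.cuda.synchronize()

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    for _ in range(10):  # fold 10 invocations into a single graph
        run()
torch.cuda.synchronize()

start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
graph.replay()  # replays all 10 captured calls with one launch
end_event.record()
end_event.synchronize()
print(f"{start_event.elapsed_time(end_event) / 10 * 1000:.1f} us per captured call")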
|
||||
|
||||
|
||||
def get_configs_compute_bound() -> List[Dict[str, int]]:
|
||||
# Reduced search space for faster tuning.
|
||||
# TODO(woosuk): Increase the search space and use a performance model to
|
||||
# prune the search space.
|
||||
configs: List[BenchmarkConfig] = []
|
||||
for num_stages in [2, 3, 4, 5]:
|
||||
for block_m in [16, 32, 64, 128, 256]:
|
||||
for block_k in [64, 128, 256]:
|
||||
for block_n in [32, 64, 128, 256]:
|
||||
for num_warps in [4, 8]:
|
||||
for group_size in [1, 16, 32, 64]:
|
||||
configs.append({
|
||||
"BLOCK_SIZE_M": block_m,
|
||||
"BLOCK_SIZE_N": block_n,
|
||||
"BLOCK_SIZE_K": block_k,
|
||||
"GROUP_SIZE_M": group_size,
|
||||
"num_warps": num_warps,
|
||||
"num_stages": num_stages,
|
||||
})
|
||||
return configs
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1)
|
||||
class BenchmarkWorker:
|
||||
|
||||
def __init__(self, seed: int) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
self.seed = seed
|
||||
|
||||
def benchmark(
|
||||
self,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8: bool,
|
||||
) -> Tuple[Dict[str, int], float]:
|
||||
torch.cuda.manual_seed_all(self.seed)
|
||||
|
||||
dtype_str = "float8" if use_fp8 else None
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
# is the intermediate size after silu_and_mul.
|
||||
op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
|
||||
dtype_str)
|
||||
if op_config is None:
|
||||
config = get_default_config(num_tokens, num_experts,
|
||||
shard_intermediate_size, hidden_size,
|
||||
topk, dtype_str)
|
||||
else:
|
||||
config = op_config[min(op_config.keys(),
|
||||
key=lambda x: abs(x - num_tokens))]
|
||||
kernel_time = benchmark_config(config, num_tokens, num_experts,
|
||||
shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8)
|
||||
return config, kernel_time
|
||||
|
||||
def tune(
|
||||
self,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8: bool,
|
||||
search_space: List[BenchmarkConfig],
|
||||
) -> BenchmarkConfig:
|
||||
best_config = None
|
||||
best_time = float("inf")
|
||||
for config in tqdm(search_space):
|
||||
try:
|
||||
kernel_time = benchmark_config(config,
|
||||
num_tokens,
|
||||
num_experts,
|
||||
shard_intermediate_size,
|
||||
hidden_size,
|
||||
topk,
|
||||
dtype,
|
||||
use_fp8,
|
||||
num_iters=10)
|
||||
except triton.runtime.autotuner.OutOfResources:
|
||||
# Some configurations may be invalid and fail to compile.
|
||||
continue
|
||||
|
||||
if kernel_time < best_time:
|
||||
best_time = kernel_time
|
||||
best_config = config
|
||||
now = datetime.now()
|
||||
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
||||
assert best_config is not None
|
||||
return best_config
|
||||
|
||||
|
||||
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
|
||||
return {
|
||||
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
|
||||
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
|
||||
"BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
|
||||
"GROUP_SIZE_M": config["GROUP_SIZE_M"],
|
||||
"num_warps": config["num_warps"],
|
||||
"num_stages": config["num_stages"],
|
||||
}
|
||||
|
||||
|
||||
def save_configs(
|
||||
configs: Dict[int, BenchmarkConfig],
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8: bool,
|
||||
) -> None:
|
||||
dtype_str = "float8" if use_fp8 else None
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
# is the intermediate size after silu_and_mul.
|
||||
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
|
||||
dtype_str)
|
||||
print(f"Writing best config to {filename}...")
|
||||
with open(filename, "w") as f:
|
||||
json.dump(configs, f, indent=4)
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
print(args)
|
||||
|
||||
config = AutoConfig.from_pretrained(args.model)
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
else:
|
||||
# Default: Mixtral.
|
||||
E = config.num_local_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = config.torch_dtype
|
||||
use_fp8 = args.dtype == "fp8"
|
||||
|
||||
if args.batch_size is None:
|
||||
batch_sizes = [
|
||||
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
|
||||
2048, 3072, 4096
|
||||
]
|
||||
else:
|
||||
batch_sizes = [args.batch_size]
|
||||
|
||||
ray.init()
|
||||
num_gpus = int(ray.available_resources()["GPU"])
|
||||
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
|
||||
|
||||
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
|
||||
outputs = []
|
||||
worker_idx = 0
|
||||
for input_args in inputs:
|
||||
worker = workers[worker_idx]
|
||||
worker_method = getattr(worker, method)
|
||||
output = worker_method.remote(*input_args)
|
||||
outputs.append(output)
|
||||
worker_idx = (worker_idx + 1) % num_gpus
|
||||
return ray.get(outputs)
|
||||
|
||||
if args.tune:
|
||||
search_space = get_configs_compute_bound()
|
||||
print(f"Start tuning over {len(search_space)} configurations...")
|
||||
|
||||
start = time.time()
|
||||
configs = _distribute(
|
||||
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8, search_space)
|
||||
for batch_size in batch_sizes])
|
||||
best_configs = {
|
||||
M: sort_config(config)
|
||||
for M, config in zip(batch_sizes, configs)
|
||||
}
|
||||
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8)
|
||||
end = time.time()
|
||||
print(f"Tuning took {end - start:.2f} seconds")
|
||||
else:
|
||||
outputs = _distribute("benchmark",
|
||||
[(batch_size, E, shard_intermediate_size,
|
||||
hidden_size, topk, dtype, use_fp8)
|
||||
for batch_size in batch_sizes])
|
||||
|
||||
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
|
||||
print(f"Batch size: {batch_size}, config: {config}")
|
||||
print(f"Kernel time: {kernel_time:.2f} us")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--model",
|
||||
type=str,
|
||||
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
|
||||
parser.add_argument("--tp-size", "-tp", type=int, default=2)
|
||||
parser.add_argument("--dtype",
|
||||
type=str,
|
||||
choices=["auto", "fp8"],
|
||||
default="auto")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--tune", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
||||
@@ -1,12 +1,12 @@
|
||||
import argparse
|
||||
import random
|
||||
import time
|
||||
from typing import List, Optional
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
|
||||
create_kv_caches_with_random)
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
||||
|
||||
NUM_BLOCKS = 1024
|
||||
PARTITION_SIZE = 512
|
||||
@ -16,7 +16,7 @@ PARTITION_SIZE = 512
|
||||
def main(
|
||||
version: str,
|
||||
num_seqs: int,
|
||||
seq_len: int,
|
||||
context_len: int,
|
||||
num_query_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
@ -48,23 +48,20 @@ def main(
|
||||
dtype=torch.float,
|
||||
device=device)
|
||||
|
||||
seq_lens = [seq_len for _ in range(num_seqs)]
|
||||
max_seq_len = max(seq_lens)
|
||||
seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
|
||||
context_lens = [context_len for _ in range(num_seqs)]
|
||||
max_context_len = max(context_lens)
|
||||
context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
|
||||
|
||||
# Create the block tables.
|
||||
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||
block_tables_lst: List[List[int]] = []
|
||||
max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
|
||||
block_tables = []
|
||||
for _ in range(num_seqs):
|
||||
block_table = [
|
||||
random.randint(0, NUM_BLOCKS - 1)
|
||||
for _ in range(max_num_blocks_per_seq)
|
||||
]
|
||||
block_tables_lst.append(block_table)
|
||||
|
||||
block_tables = torch.tensor(block_tables_lst,
|
||||
dtype=torch.int,
|
||||
device=device)
|
||||
block_tables.append(block_table)
|
||||
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
|
||||
|
||||
# Create the KV cache.
|
||||
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
|
||||
@ -80,7 +77,8 @@ def main(
|
||||
# Prepare for the paged attention kernel.
|
||||
output = torch.empty_like(query)
|
||||
if version == "v2":
|
||||
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
|
||||
PARTITION_SIZE)
|
||||
tmp_output = torch.empty(
|
||||
size=(num_seqs, num_query_heads, num_partitions, head_size),
|
||||
dtype=output.dtype,
|
||||
@ -112,9 +110,9 @@ def main(
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
context_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
max_context_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
kv_scale,
|
||||
@ -131,9 +129,9 @@ def main(
|
||||
num_kv_heads,
|
||||
scale,
|
||||
block_tables,
|
||||
seq_lens,
|
||||
context_lens,
|
||||
block_size,
|
||||
max_seq_len,
|
||||
max_context_len,
|
||||
alibi_slopes,
|
||||
kv_cache_dtype,
|
||||
kv_scale,
|
||||
@ -161,19 +159,19 @@ def main(
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = FlexibleArgumentParser(
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Benchmark the paged attention kernel.")
|
||||
parser.add_argument("--version",
|
||||
type=str,
|
||||
choices=["v1", "v2"],
|
||||
default="v2")
|
||||
parser.add_argument("--batch-size", type=int, default=8)
|
||||
parser.add_argument("--seq-len", type=int, default=4096)
|
||||
parser.add_argument("--context-len", type=int, default=4096)
|
||||
parser.add_argument("--num-query-heads", type=int, default=64)
|
||||
parser.add_argument("--num-kv-heads", type=int, default=8)
|
||||
parser.add_argument("--head-size",
|
||||
type=int,
|
||||
choices=[64, 80, 96, 112, 128, 192, 256],
|
||||
choices=[64, 80, 96, 112, 128, 256],
|
||||
default=128)
|
||||
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
|
||||
parser.add_argument("--use-alibi", action="store_true")
|
||||
@ -186,11 +184,13 @@ if __name__ == '__main__':
|
||||
parser.add_argument(
|
||||
"--kv-cache-dtype",
|
||||
type=str,
|
||||
choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
|
||||
choices=["auto", "fp8"],
|
||||
default="auto",
|
||||
help="Data type for kv cache storage. If 'auto', will use model "
|
||||
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
|
||||
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
|
||||
help=
|
||||
'Data type for kv cache storage. If "auto", will use model data type. '
|
||||
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
|
||||
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
|
||||
'common inference criteria.')
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
@ -199,7 +199,7 @@ if __name__ == '__main__':
|
||||
main(
|
||||
version=args.version,
|
||||
num_seqs=args.batch_size,
|
||||
seq_len=args.seq_len,
|
||||
context_len=args.context_len,
|
||||
num_query_heads=args.num_query_heads,
|
||||
num_kv_heads=args.num_kv_heads,
|
||||
head_size=args.head_size,
|
||||
|
||||
@@ -1,12 +1,11 @@
import argparse
from itertools import accumulate
from typing import List, Optional
from typing import Optional

import nvtx
import torch

from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
from vllm.utils import FlexibleArgumentParser
from vllm.model_executor.layers.rotary_embedding import get_rope


def benchmark_rope_kernels_multi_lora(
@@ -38,7 +37,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes: List[RotaryEmbedding] = []
non_batched_ropes = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
@@ -86,7 +85,7 @@ def benchmark_rope_kernels_multi_lora(


if __name__ == '__main__':
parser = FlexibleArgumentParser(
parser = argparse.ArgumentParser(
description="Benchmark the rotary embedding kernels.")
parser.add_argument("--is-neox-style", type=bool, default=True)
parser.add_argument("--batch-size", type=int, default=16)
@@ -94,7 +93,7 @@ if __name__ == '__main__':
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
choices=[64, 80, 96, 112, 128, 192, 256],
choices=[64, 80, 96, 112, 128, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",

@@ -1,75 +0,0 @@
WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]],
"mistralai/Mistral-7B-v0.1/TP1": [
[4096, 6144],
[4096, 4096],
[4096, 28672],
[14336, 4096],
],
"mistralai/Mistral-7B-v0.1/TP2": [
[4096, 3072],
[2048, 4096],
[4096, 14336],
[7168, 4096],
],
"mistralai/Mistral-7B-v0.1/TP4": [
[4096, 1536],
[1024, 4096],
[4096, 7168],
[3584, 4096],
],
"meta-llama/Llama-2-7b-hf/TP1": [
[4096, 12288],
[4096, 4096],
[4096, 22016],
[11008, 4096],
],
"meta-llama/Llama-2-7b-hf/TP2": [
[4096, 6144],
[2048, 4096],
[4096, 11008],
[5504, 4096],
],
"meta-llama/Llama-2-7b-hf/TP4": [
[4096, 3072],
[1024, 4096],
[4096, 5504],
[2752, 4096],
],
"meta-llama/Llama-2-13b-hf/TP1": [
[5120, 15360],
[5120, 5120],
[5120, 27648],
[13824, 5120],
],
"meta-llama/Llama-2-13b-hf/TP2": [
[5120, 7680],
[2560, 5120],
[5120, 13824],
[6912, 5120],
],
"meta-llama/Llama-2-13b-hf/TP4": [
[5120, 3840],
[1280, 5120],
[5120, 6912],
[3456, 5120],
],
"meta-llama/Llama-2-70b-hf/TP1": [
[8192, 10240],
[8192, 8192],
[8192, 57344],
[28672, 8192],
],
"meta-llama/Llama-2-70b-hf/TP2": [
[8192, 5120],
[4096, 8192],
[8192, 28672],
[14336, 8192],
],
"meta-llama/Llama-2-70b-hf/TP4": [
[8192, 2560],
[2048, 8192],
[8192, 14336],
[7168, 8192],
],
}
@@ -4,7 +4,7 @@ PORT=8000
MODEL=$1
TOKENS=$2

docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
docker run --gpus all --shm-size 1g -p $PORT:80 \
-v $PWD/data:/data \
ghcr.io/huggingface/text-generation-inference:1.4.0 \
--model-id $MODEL \

@@ -1,63 +0,0 @@
import cProfile
import pstats

from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser

# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
] * 1000
LONG_PROMPT = ' '.join(LONG_PROMPT)


def main(args):
llm = LLM(
model=args.model,
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
use_v2_block_manager=args.use_v2_block_manager,
)

sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
profiler = cProfile.Profile()

print("------warm up------")
for i in range(3):
output = llm.generate(LONG_PROMPT, sampling_params)
print(output[0].outputs[0].text)

print("------start generating------")
for i in range(3):
profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
globals(), locals())

# analyze the runtime of hashing function
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative')
total_time = 0
total_calls = 0
for func in stats.stats:
if 'hash_of_block' in func[2]:
total_time = stats.stats[func][3]
total_calls = stats.stats[func][0]
percentage = (total_time / stats.total_tt) * 100
print(f"Hashing took {total_time:.2f} seconds,"
f"{percentage:.2f}% of the total runtime.")


if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Benchmark the performance of hashing function in'
'automatic prefix caching.')
parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
parser.add_argument('--use-v2-block-manager',
action='store_true',
help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)
@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
|
||||
#
|
||||
# Check the compile flags
|
||||
#
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-fopenmp"
|
||||
"-DVLLM_CPU_EXTENSION")
|
||||
|
||||
@ -33,23 +33,9 @@ function (find_isa CPUINFO TARGET OUT)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function (is_avx512_disabled OUT)
|
||||
set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
|
||||
if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
|
||||
set(${OUT} ON PARENT_SCOPE)
|
||||
else()
|
||||
set(${OUT} OFF PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
is_avx512_disabled(AVX512_DISABLED)
|
||||
|
||||
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
||||
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
||||
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
|
||||
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
|
||||
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
if (AVX512_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-mavx512f"
|
||||
"-mavx512vl"
|
||||
@ -58,8 +44,8 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
|
||||
find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
|
||||
if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
|
||||
else()
|
||||
message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
|
||||
@ -67,18 +53,8 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
else()
|
||||
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
|
||||
endif()
|
||||
elseif (AVX2_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
|
||||
message(WARNING "vLLM CPU backend using AVX2 ISA")
|
||||
elseif (POWER9_FOUND OR POWER10_FOUND)
|
||||
message(STATUS "PowerPC detected")
|
||||
# Check for PowerPC VSX support
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-mvsx"
|
||||
"-mcpu=native"
|
||||
"-mtune=native")
|
||||
else()
|
||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
|
||||
message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
|
||||
endif()
|
||||
|
||||
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
|
||||
@ -97,7 +73,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/cpu/cache.cpp"
|
||||
"csrc/cpu/layernorm.cpp"
|
||||
"csrc/cpu/pos_encoding.cpp"
|
||||
"csrc/cpu/torch_bindings.cpp")
|
||||
"csrc/cpu/pybind.cpp")
|
||||
|
||||
define_gpu_extension_target(
|
||||
_C
|
||||
@ -105,10 +81,10 @@ define_gpu_extension_target(
|
||||
LANGUAGE CXX
|
||||
SOURCES ${VLLM_EXT_SRC}
|
||||
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
|
||||
USE_SABI 3
|
||||
WITH_SOABI
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
add_custom_target(default)
|
||||
message(STATUS "Enabling C extension.")
|
||||
add_dependencies(default _C)
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
|
||||
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
|
||||
set(Python_EXECUTABLE ${EXECUTABLE})
|
||||
find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
|
||||
find_package(Python COMPONENTS Interpreter Development.Module)
|
||||
if (NOT Python_FOUND)
|
||||
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
|
||||
endif()
|
||||
@ -99,7 +99,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
|
||||
"Failed to determine torch nvcc compiler flags")
|
||||
|
||||
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
|
||||
list(APPEND GPU_FLAGS "-DENABLE_FP8")
|
||||
list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
|
||||
endif()
|
||||
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
|
||||
list(REMOVE_ITEM GPU_FLAGS
|
||||
@ -119,7 +119,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
|
||||
|
||||
list(APPEND GPU_FLAGS
|
||||
"-DUSE_ROCM"
|
||||
"-DENABLE_FP8"
|
||||
"-DENABLE_FP8_E4M3"
|
||||
"-U__HIP_NO_HALF_CONVERSIONS__"
|
||||
"-U__HIP_NO_HALF_OPERATORS__"
|
||||
"-fno-gpu-rdc")
|
||||
@ -147,23 +147,16 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
|
||||
if (${GPU_LANG} STREQUAL "HIP")
|
||||
#
|
||||
# `GPU_ARCHES` controls the `--offload-arch` flags.
|
||||
# `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
|
||||
# via the `PYTORCH_ROCM_ARCH` env variable.
|
||||
#
|
||||
# If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
|
||||
# if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
|
||||
# "rocm_agent_enumerator" in "enable_language(HIP)"
|
||||
# (in file Modules/CMakeDetermineHIPCompiler.cmake)
|
||||
#
|
||||
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
|
||||
set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
|
||||
else()
|
||||
set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
|
||||
endif()
|
||||
|
||||
#
|
||||
# Find the intersection of the supported + detected architectures to
|
||||
# set the module architecture flags.
|
||||
#
|
||||
set(${GPU_ARCHES})
|
||||
foreach (_ARCH ${HIP_ARCHITECTURES})
|
||||
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
|
||||
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
|
||||
list(APPEND ${GPU_ARCHES} ${_ARCH})
|
||||
endif()
|
||||
@ -171,7 +164,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
|
||||
|
||||
if(NOT ${GPU_ARCHES})
|
||||
message(FATAL_ERROR
|
||||
"None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
|
||||
"None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
|
||||
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
|
||||
endif()
|
||||
|
||||
@ -301,7 +294,6 @@ endmacro()
|
||||
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
|
||||
# LIBRARIES <libraries> - Extra link libraries.
|
||||
# WITH_SOABI - Generate library with python SOABI suffix name.
|
||||
# USE_SABI <version> - Use python stable api <version>
|
||||
#
|
||||
# Note: optimization level/debug info is set via cmake build type.
|
||||
#
|
||||
@ -309,7 +301,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
||||
cmake_parse_arguments(PARSE_ARGV 1
|
||||
GPU
|
||||
"WITH_SOABI"
|
||||
"DESTINATION;LANGUAGE;USE_SABI"
|
||||
"DESTINATION;LANGUAGE"
|
||||
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
|
||||
|
||||
# Add hipify preprocessing step when building with HIP/ROCm.
|
||||
@ -323,11 +315,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
|
||||
set(GPU_WITH_SOABI)
|
||||
endif()
|
||||
|
||||
if (GPU_USE_SABI)
|
||||
Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
|
||||
else()
|
||||
Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
|
||||
endif()
|
||||
Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
|
||||
|
||||
if (GPU_LANGUAGE STREQUAL "HIP")
|
||||
# Make this target dependent on the hipify preprocessor step.
|
||||
|
||||
@ -63,8 +63,6 @@ DEFAULT_CONDA_PATTERNS = {
|
||||
"magma",
|
||||
"triton",
|
||||
"optree",
|
||||
"nccl",
|
||||
"transformers",
|
||||
}
|
||||
|
||||
DEFAULT_PIP_PATTERNS = {
|
||||
@ -75,8 +73,6 @@ DEFAULT_PIP_PATTERNS = {
|
||||
"triton",
|
||||
"optree",
|
||||
"onnx",
|
||||
"nccl",
|
||||
"transformers",
|
||||
}
|
||||
|
||||
|
||||
@ -603,11 +599,6 @@ Versions of relevant libraries:
|
||||
{conda_packages}
|
||||
""".strip()
|
||||
|
||||
# both the above code and the following code use `strip()` to
|
||||
# remove leading/trailing whitespaces, so we need to add a newline
|
||||
# in between to separate the two sections
|
||||
env_info_fmt += "\n"
|
||||
|
||||
env_info_fmt += """
|
||||
ROCM Version: {rocm_version}
|
||||
Neuron SDK Version: {neuron_sdk_version}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <torch/all.h>
|
||||
#include <torch/extension.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include <cmath>
|
||||
@ -10,11 +10,11 @@
|
||||
namespace vllm {
|
||||
|
||||
// Activation and gating kernel template.
|
||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||
__global__ void act_and_mul_kernel(
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||
const int d) {
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||
const int d) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
|
||||
@ -23,66 +23,72 @@ __global__ void act_and_mul_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T silu_kernel(const T& x) {
|
||||
// x * sigmoid(x)
|
||||
return (T)(((float)x) / (1.0f + expf((float)-x)));
|
||||
return (T) (((float) x) / (1.0f + expf((float) -x)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T gelu_kernel(const T& x) {
|
||||
// Equivalent to PyTorch GELU with 'none' approximation.
|
||||
// Refer to:
|
||||
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
|
||||
const float f = (float)x;
|
||||
const float f = (float) x;
|
||||
constexpr float ALPHA = M_SQRT1_2;
|
||||
return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
|
||||
return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
|
||||
// Equivalent to PyTorch GELU with 'tanh' approximation.
|
||||
// Refer to:
|
||||
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
|
||||
const float f = (float)x;
|
||||
const float f = (float) x;
|
||||
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
|
||||
constexpr float KAPPA = 0.044715;
|
||||
float x_cube = f * f * f;
|
||||
float inner = BETA * (f + KAPPA * x_cube);
|
||||
return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
|
||||
return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
// Launch activation and gating kernel.
|
||||
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
|
||||
int d = input.size(-1) / 2; \
|
||||
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||
dim3 grid(num_tokens); \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||
input.scalar_type(), "act_and_mul_kernel", [&] { \
|
||||
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
|
||||
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
|
||||
input.data_ptr<scalar_t>(), d); \
|
||||
});
|
||||
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
|
||||
int d = input.size(-1) / 2; \
|
||||
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||
dim3 grid(num_tokens); \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||
input.scalar_type(), \
|
||||
"act_and_mul_kernel", \
|
||||
[&] { \
|
||||
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), \
|
||||
input.data_ptr<scalar_t>(), \
|
||||
d); \
|
||||
});
|
||||
|
||||
void silu_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
void silu_and_mul(
|
||||
torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
|
||||
}
|
||||
|
||||
void gelu_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
void gelu_and_mul(
|
||||
torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
|
||||
}
|
||||
|
||||
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
void gelu_tanh_and_mul(
|
||||
torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
|
||||
}
|
||||
@ -90,11 +96,11 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
|
||||
namespace vllm {
|
||||
|
||||
// Element-wise activation kernel template.
|
||||
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||
__global__ void activation_kernel(
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., d]
|
||||
const int d) {
|
||||
scalar_t* __restrict__ out, // [..., d]
|
||||
const scalar_t* __restrict__ input, // [..., d]
|
||||
const int d) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
|
||||
@ -102,61 +108,54 @@ __global__ void activation_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
// Launch element-wise activation kernel.
|
||||
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
|
||||
int d = input.size(-1); \
|
||||
int64_t num_tokens = input.numel() / d; \
|
||||
dim3 grid(num_tokens); \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
|
||||
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
|
||||
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
|
||||
input.data_ptr<scalar_t>(), d); \
|
||||
});
|
||||
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
|
||||
int d = input.size(-1); \
|
||||
int64_t num_tokens = input.numel() / d; \
|
||||
dim3 grid(num_tokens); \
|
||||
dim3 block(std::min(d, 1024)); \
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||
input.scalar_type(), \
|
||||
"activation_kernel", \
|
||||
[&] { \
|
||||
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
|
||||
out.data_ptr<scalar_t>(), \
|
||||
input.data_ptr<scalar_t>(), \
|
||||
d); \
|
||||
});
|
||||
|
||||
namespace vllm {
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
|
||||
const float x3 = (float)(x * x * x);
|
||||
const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
|
||||
return ((T)0.5) * x * (((T)1.0) + t);
|
||||
const float x3 = (float) (x * x * x);
|
||||
const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
|
||||
return ((T) 0.5) * x * (((T) 1.0) + t);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
|
||||
const float f = (float)x;
|
||||
const T t =
|
||||
(T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
|
||||
return ((T)0.5) * x * (((T)1.0) + t);
|
||||
const float f = (float) x;
|
||||
const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
|
||||
return ((T) 0.5) * x * (((T) 1.0) + t);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T gelu_quick_kernel(const T& x) {
|
||||
// x * sigmoid(1.702 * x)
|
||||
return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x)));
|
||||
}
|
||||
} // namespace vllm
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
void gelu_new(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., d]
|
||||
void gelu_new(
|
||||
torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
|
||||
}
|
||||
|
||||
void gelu_fast(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., d]
|
||||
void gelu_fast(
|
||||
torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
||||
}
|
||||
|
||||
void gelu_quick(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
|
||||
}
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Copyright (c) 2023, The vLLM team.
|
||||
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
@ -23,31 +22,31 @@
|
||||
namespace vllm {
|
||||
|
||||
// A vector type to store Q, K, V elements.
|
||||
template <typename T, int VEC_SIZE>
|
||||
template<typename T, int VEC_SIZE>
|
||||
struct Vec {};
|
||||
|
||||
// A vector type to store FP32 accumulators.
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
struct FloatVec {};
|
||||
|
||||
// Template vector operations.
|
||||
template <typename Acc, typename A, typename B>
|
||||
template<typename Acc, typename A, typename B>
|
||||
inline __device__ Acc mul(A a, B b);
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
inline __device__ float sum(T v);
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
inline __device__ float dot(T a, T b) {
|
||||
return sum(mul<T, T, T>(a, b));
|
||||
}
|
||||
|
||||
template <typename A, typename T>
|
||||
template<typename A, typename T>
|
||||
inline __device__ float dot(T a, T b) {
|
||||
return sum(mul<A, T, T>(a, b));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
inline __device__ void zero(T& dst) {
|
||||
constexpr int WORDS = sizeof(T) / 4;
|
||||
union {
|
||||
@ -62,4 +61,4 @@ inline __device__ void zero(T& dst) {
|
||||
dst = tmp.raw;
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
File diff suppressed because it is too large
@ -1,6 +1,5 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* Copyright (c) 2023, The vLLM team.
|
||||
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
@ -27,7 +26,7 @@
|
||||
namespace vllm {
|
||||
|
||||
// Q*K^T operation.
|
||||
template <int THREAD_GROUP_SIZE, typename Vec, int N>
|
||||
template<int THREAD_GROUP_SIZE, typename Vec, int N>
|
||||
inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
|
||||
using A_vec = typename FloatVec<Vec>::Type;
|
||||
// Compute the parallel products for Q*K^T (treat vector lanes separately).
|
||||
@ -46,12 +45,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
|
||||
return qk;
|
||||
}
|
||||
|
||||
template <typename T, int THREAD_GROUP_SIZE>
|
||||
template<typename T, int THREAD_GROUP_SIZE>
|
||||
struct Qk_dot {
|
||||
template <typename Vec, int N>
|
||||
template<typename Vec, int N>
|
||||
static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
|
||||
return qk_dot_<THREAD_GROUP_SIZE>(q, k);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* and
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Copyright (c) 2023, The vLLM team.
|
||||
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
@ -30,8 +28,8 @@
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
typedef __hip_bfloat162 __nv_bfloat162;
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
typedef __hip_bfloat162 __nv_bfloat162;
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
@ -52,37 +50,37 @@ struct bf16_8_t {
|
||||
};
|
||||
|
||||
// BF16 vector types for Q, K, V.
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<__nv_bfloat16, 1> {
|
||||
using Type = __nv_bfloat16;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<__nv_bfloat16, 2> {
|
||||
using Type = __nv_bfloat162;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<__nv_bfloat16, 4> {
|
||||
using Type = bf16_4_t;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<__nv_bfloat16, 8> {
|
||||
using Type = bf16_8_t;
|
||||
};
|
||||
|
||||
// FP32 accumulator vector types corresponding to Vec.
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<__nv_bfloat16> {
|
||||
using Type = float;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<__nv_bfloat162> {
|
||||
using Type = float2;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<bf16_4_t> {
|
||||
using Type = Float4_;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<bf16_8_t> {
|
||||
using Type = Float8_;
|
||||
};
|
||||
@ -110,9 +108,9 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
|
||||
assert(false);
|
||||
#else
|
||||
#ifndef USE_ROCM
|
||||
return a + b;
|
||||
return a + b;
|
||||
#else
|
||||
return __hadd(a, b);
|
||||
return __hadd(a, b);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
@ -163,7 +161,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
|
||||
}
|
||||
|
||||
// Vector multiplication.
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
assert(false);
|
||||
@ -172,7 +170,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
assert(false);
|
||||
@ -181,12 +179,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
|
||||
return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
|
||||
bf16_4_t c;
|
||||
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||
@ -194,7 +192,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
|
||||
__nv_bfloat162 s = bf162bf162(a);
|
||||
bf16_4_t c;
|
||||
@ -203,7 +201,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
|
||||
bf16_8_t c;
|
||||
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||
@ -213,7 +211,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
|
||||
__nv_bfloat162 s = bf162bf162(a);
|
||||
bf16_8_t c;
|
||||
@ -224,26 +222,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
||||
float fa = __bfloat162float(a);
|
||||
float fb = __bfloat162float(b);
|
||||
return fa * fb;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
||||
float2 fa = bf1622float2(a);
|
||||
float2 fb = bf1622float2(b);
|
||||
return mul<float2, float2, float2>(fa, fb);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
|
||||
return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
|
||||
Float4_ fc;
|
||||
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||
@ -251,7 +249,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
|
||||
return fc;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
|
||||
__nv_bfloat162 s = bf162bf162(a);
|
||||
Float4_ fc;
|
||||
@ -260,7 +258,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
|
||||
return fc;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
|
||||
Float8_ fc;
|
||||
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||
@ -270,7 +268,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
|
||||
return fc;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
|
||||
__nv_bfloat162 s = bf162bf162(a);
|
||||
Float8_ fc;
|
||||
@ -282,8 +280,7 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
|
||||
}
|
||||
|
||||
// Vector fused multiply-add.
|
||||
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
|
||||
__nv_bfloat162 c) {
|
||||
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
assert(false);
|
||||
#else
|
||||
@ -291,8 +288,7 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
|
||||
__nv_bfloat162 c) {
|
||||
inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||
assert(false);
|
||||
#else
|
||||
@ -383,23 +379,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
|
||||
}
|
||||
|
||||
// Vector sum.
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(__nv_bfloat16 v) {
|
||||
return __bfloat162float(v);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(__nv_bfloat162 v) {
|
||||
float2 vf = bf1622float2(v);
|
||||
return vf.x + vf.y;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(bf16_4_t v) {
|
||||
return sum(v.x) + sum(v.y);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(bf16_8_t v) {
|
||||
return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
|
||||
}
|
||||
@ -452,4 +448,4 @@ inline __device__ void zero(__nv_bfloat16& dst) {
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* and
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Copyright (c) 2023, The vLLM team.
|
||||
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
@ -32,37 +30,37 @@
|
||||
namespace vllm {
|
||||
|
||||
// FP16 vector types for Q, K, V.
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<uint16_t, 1> {
|
||||
using Type = uint16_t;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<uint16_t, 2> {
|
||||
using Type = uint32_t;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<uint16_t, 4> {
|
||||
using Type = uint2;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<uint16_t, 8> {
|
||||
using Type = uint4;
|
||||
};
|
||||
|
||||
// FP32 accumulator vector types corresponding to Vec.
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<uint16_t> {
|
||||
using Type = float;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<uint32_t> {
|
||||
using Type = float2;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<uint2> {
|
||||
using Type = Float4_;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<uint4> {
|
||||
using Type = Float8_;
|
||||
};
|
||||
@ -75,8 +73,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) {
|
||||
return b;
|
||||
#else
|
||||
union {
|
||||
uint32_t u32;
|
||||
uint16_t u16[2];
|
||||
uint32_t u32;
|
||||
uint16_t u16[2];
|
||||
} tmp;
|
||||
tmp.u16[0] = a;
|
||||
tmp.u16[1] = a;
|
||||
@ -132,12 +130,10 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
|
||||
} tmp;
|
||||
#ifndef USE_ROCM
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
|
||||
: "=r"(tmp.u32)
|
||||
: "f"(f.y), "f"(f.x));
|
||||
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
|
||||
#else
|
||||
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
|
||||
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
|
||||
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
|
||||
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
|
||||
#endif
|
||||
#else
|
||||
tmp.u16[0] = float_to_half(f.x);
|
||||
@ -205,7 +201,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
|
||||
}
|
||||
|
||||
// Vector multiplication.
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
|
||||
uint16_t c;
|
||||
#ifndef USE_ROCM
|
||||
@ -216,7 +212,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
|
||||
uint32_t c;
|
||||
#ifndef USE_ROCM
|
||||
@ -227,12 +223,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
|
||||
return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ uint2 mul(uint2 a, uint2 b) {
|
||||
uint2 c;
|
||||
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
|
||||
@ -240,7 +236,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ uint2 mul(uint16_t a, uint2 b) {
|
||||
uint32_t s = h0_h0(a);
|
||||
uint2 c;
|
||||
@ -249,7 +245,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ uint4 mul(uint4 a, uint4 b) {
|
||||
uint4 c;
|
||||
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
|
||||
@ -259,7 +255,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ uint4 mul(uint16_t a, uint4 b) {
|
||||
uint32_t s = h0_h0(a);
|
||||
uint4 c;
|
||||
@ -270,26 +266,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float mul(uint16_t a, uint16_t b) {
|
||||
float fa = half_to_float(a);
|
||||
float fb = half_to_float(b);
|
||||
return fa * fb;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float2 mul(uint32_t a, uint32_t b) {
|
||||
float2 fa = half2_to_float2(a);
|
||||
float2 fb = half2_to_float2(b);
|
||||
return mul<float2, float2, float2>(fa, fb);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float2 mul(uint16_t a, uint32_t b) {
|
||||
return mul<float2, uint32_t, uint32_t>(h0_h0(a), b);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float4_ mul(uint2 a, uint2 b) {
|
||||
Float4_ fc;
|
||||
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
|
||||
@ -297,7 +293,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) {
|
||||
return fc;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float4_ mul(uint16_t a, uint2 b) {
|
||||
uint32_t s = h0_h0(a);
|
||||
Float4_ fc;
|
||||
@ -306,7 +302,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
|
||||
return fc;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float8_ mul(uint4 a, uint4 b) {
|
||||
Float8_ fc;
|
||||
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
|
||||
@ -316,7 +312,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) {
|
||||
return fc;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ Float8_ mul(uint16_t a, uint4 b) {
|
||||
uint32_t s = h0_h0(a);
|
||||
Float8_ fc;
|
||||
@ -331,13 +327,9 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
|
||||
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
|
||||
uint32_t d;
|
||||
#ifndef USE_ROCM
|
||||
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
|
||||
: "=r"(d)
|
||||
: "r"(a), "r"(b), "r"(c));
|
||||
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
|
||||
#else
|
||||
asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
|
||||
: "=v"(d)
|
||||
: "v"(a), "v"(b), "v"(c));
|
||||
asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
|
||||
#endif
|
||||
return d;
|
||||
}
|
||||
@ -431,24 +423,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
|
||||
}
|
||||
|
||||
// Vector sum.
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(uint16_t v) {
|
||||
return half_to_float(v);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(uint32_t v) {
|
||||
float2 tmp = half2_to_float2(v);
|
||||
return tmp.x + tmp.y;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(uint2 v) {
|
||||
uint32_t c = add(v.x, v.y);
|
||||
return sum(c);
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(uint4 v) {
|
||||
uint32_t c = add(v.x, v.y);
|
||||
c = add(c, v.z);
|
||||
@ -478,9 +470,13 @@ inline __device__ void from_float(uint4& dst, Float8_ src) {
|
||||
}
|
||||
|
||||
// From float16 to float32.
|
||||
inline __device__ float to_float(uint16_t u) { return half_to_float(u); }
|
||||
inline __device__ float to_float(uint16_t u) {
|
||||
return half_to_float(u);
|
||||
}
|
||||
|
||||
inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }
|
||||
inline __device__ float2 to_float(uint32_t u) {
|
||||
return half2_to_float2(u);
|
||||
}
|
||||
|
||||
inline __device__ Float4_ to_float(uint2 u) {
|
||||
Float4_ tmp;
|
||||
@ -499,6 +495,8 @@ inline __device__ Float8_ to_float(uint4 u) {
|
||||
}
|
||||
|
||||
// Zero-out a variable.
|
||||
inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }
|
||||
inline __device__ void zero(uint16_t& dst) {
|
||||
dst = uint16_t(0);
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
/*
|
||||
* Adapted from
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* and
|
||||
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
|
||||
* and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
|
||||
* Copyright (c) 2023, The vLLM team.
|
||||
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
@ -40,35 +38,37 @@ struct Float8_ {
|
||||
};
|
||||
|
||||
// FP32 vector types for Q, K, V.
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<float, 1> {
|
||||
using Type = float;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<float, 2> {
|
||||
using Type = float2;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<float, 4> {
|
||||
using Type = float4;
|
||||
};
|
||||
|
||||
// FP32 accumulator vector types corresponding to Vec.
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<float> {
|
||||
using Type = float;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<float2> {
|
||||
using Type = float2;
|
||||
};
|
||||
template <>
|
||||
template<>
|
||||
struct FloatVec<float4> {
|
||||
using Type = float4;
|
||||
};
|
||||
|
||||
// Vector addition.
|
||||
inline __device__ float add(float a, float b) { return a + b; }
|
||||
inline __device__ float add(float a, float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
inline __device__ float2 add(float2 a, float2 b) {
|
||||
float2 c;
|
||||
@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) {
|
||||
}
|
||||
|
||||
// Vector multiplication.
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float mul<float, float>(float a, float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float2 mul(float2 a, float2 b) {
|
||||
float2 c;
|
||||
c.x = a.x * b.x;
|
||||
@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float2 mul(float a, float2 b) {
|
||||
float2 c;
|
||||
c.x = a * b.x;
|
||||
@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float4 mul(float4 a, float4 b) {
|
||||
float4 c;
|
||||
c.x = a.x * b.x;
|
||||
@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) {
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float4 mul(float a, float4 b) {
|
||||
float4 c;
|
||||
c.x = a * b.x;
|
||||
@ -129,7 +129,9 @@ inline __device__ float4 mul(float a, float4 b) {
|
||||
}
|
||||
|
||||
// Vector fused multiply-add.
|
||||
inline __device__ float fma(float a, float b, float c) { return a * b + c; }
|
||||
inline __device__ float fma(float a, float b, float c) {
|
||||
return a * b + c;
|
||||
}
|
||||
|
||||
inline __device__ float2 fma(float2 a, float2 b, float2 c) {
|
||||
float2 d;
|
||||
@ -180,33 +182,35 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
|
||||
}
|
||||
|
||||
// Vector sum.
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(float v) {
|
||||
return v;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(float2 v) {
|
||||
return v.x + v.y;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(float4 v) {
|
||||
return v.x + v.y + v.z + v.w;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(Float4_ v) {
|
||||
return v.x.x + v.x.y + v.y.x + v.y.y;
|
||||
}
|
||||
|
||||
template <>
|
||||
template<>
|
||||
inline __device__ float sum(Float8_ v) {
|
||||
return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
|
||||
}
|
||||
|
||||
// Vector dot product.
|
||||
inline __device__ float dot(float a, float b) { return a * b; }
|
||||
inline __device__ float dot(float a, float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
inline __device__ float dot(float2 a, float2 b) {
|
||||
float2 c = mul<float2, float2, float2>(a, b);
|
||||
@ -228,24 +232,42 @@ inline __device__ float dot(Float8_ a, Float8_ b) {
|
||||
}
|
||||
|
||||
// From float to float.
|
||||
inline __device__ void from_float(float& dst, float src) { dst = src; }
|
||||
inline __device__ void from_float(float& dst, float src) {
|
||||
dst = src;
|
||||
}
|
||||
|
||||
inline __device__ void from_float(float2& dst, float2 src) { dst = src; }
|
||||
inline __device__ void from_float(float2& dst, float2 src) {
|
||||
dst = src;
|
||||
}
|
||||
|
||||
inline __device__ void from_float(float4& dst, float4 src) { dst = src; }
|
||||
inline __device__ void from_float(float4& dst, float4 src) {
|
||||
dst = src;
|
||||
}
|
||||
|
||||
// From float to float.
|
||||
inline __device__ float to_float(float u) { return u; }
|
||||
inline __device__ float to_float(float u) {
|
||||
return u;
|
||||
}
|
||||
|
||||
inline __device__ float2 to_float(float2 u) { return u; }
|
||||
inline __device__ float2 to_float(float2 u) {
|
||||
return u;
|
||||
}
|
||||
|
||||
inline __device__ float4 to_float(float4 u) { return u; }
|
||||
inline __device__ float4 to_float(float4 u) {
|
||||
return u;
|
||||
}
|
||||
|
||||
inline __device__ Float4_ to_float(Float4_ u) { return u; }
|
||||
inline __device__ Float4_ to_float(Float4_ u) {
|
||||
return u;
|
||||
}
|
||||
|
||||
inline __device__ Float8_ to_float(Float8_ u) { return u; }
|
||||
inline __device__ Float8_ to_float(Float8_ u) {
|
||||
return u;
|
||||
}
|
||||
|
||||
// Zero-out a variable.
|
||||
inline __device__ void zero(float& dst) { dst = 0.f; }
|
||||
inline __device__ void zero(float& dst) {
|
||||
dst = 0.f;
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
@ -3,39 +3,33 @@
|
||||
#include "attention_generic.cuh"
|
||||
|
||||
#include <stdint.h>
|
||||
#ifdef ENABLE_FP8
|
||||
#ifndef USE_ROCM
|
||||
#include <cuda_fp8.h>
|
||||
#endif // USE_ROCM
|
||||
#endif // ENABLE_FP8
|
||||
#ifdef ENABLE_FP8_E5M2
|
||||
#include <cuda_fp8.h>
|
||||
#endif
|
||||
|
||||
namespace vllm {
|
||||
|
||||
enum class Fp8KVCacheDataType {
|
||||
kAuto = 0,
|
||||
kFp8E4M3 = 1,
|
||||
kFp8E5M2 = 2,
|
||||
};
|
||||
|
||||
#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
|
||||
// fp8 vector types for quantization of kv cache
|
||||
template <>
|
||||
|
||||
template<>
|
||||
struct Vec<uint8_t, 1> {
|
||||
using Type = uint8_t;
|
||||
using Type = uint8_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<uint8_t, 2> {
|
||||
using Type = uint16_t;
|
||||
using Type = uint16_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<uint8_t, 4> {
|
||||
using Type = uint32_t;
|
||||
using Type = uint32_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
template<>
|
||||
struct Vec<uint8_t, 8> {
|
||||
using Type = uint2;
|
||||
using Type = uint2;
|
||||
};
|
||||
#endif // ENABLE_FP8_E5M2
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
42 csrc/cache.h
@ -1,32 +1,30 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
const torch::Tensor& block_mapping);
|
||||
void swap_blocks(
|
||||
torch::Tensor& src,
|
||||
torch::Tensor& dst,
|
||||
const std::map<int64_t, int64_t>& block_mapping);
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& block_mapping);
|
||||
void copy_blocks(
|
||||
std::vector<torch::Tensor>& key_caches,
|
||||
std::vector<torch::Tensor>& value_caches,
|
||||
const std::map<int64_t, std::vector<int64_t>>& block_mapping);
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
const std::string& kv_cache_dtype,
|
||||
const double kv_scale);
|
||||
|
||||
void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
|
||||
torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
const std::string& kv_cache_dtype);
|
||||
void reshape_and_cache(
|
||||
torch::Tensor& key,
|
||||
torch::Tensor& value,
|
||||
torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
const std::string& kv_cache_dtype,
|
||||
const float kv_scale);
|
||||
|
||||
// Just for unittest
|
||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double scale, const std::string& kv_cache_dtype);
|
||||
void convert_fp8(
|
||||
torch::Tensor& src_cache,
|
||||
torch::Tensor& dst_cache);
|
||||
|
||||
@ -1,14 +1,13 @@
|
||||
#include <torch/all.h>
|
||||
#include <torch/extension.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include "quantization/fp8/amd/quant_utils.cuh"
|
||||
#else
|
||||
#include "quantization/fp8/nvidia/quant_utils.cuh"
|
||||
#if defined(ENABLE_FP8_E5M2)
|
||||
#include "quantization/fp8_e5m2_kvcache/quant_utils.cuh"
|
||||
#elif defined(ENABLE_FP8_E4M3)
|
||||
#include "quantization/fp8/amd_detail/quant_utils.cuh"
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
@ -18,17 +17,20 @@
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_bf16.h>
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
#endif
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
const torch::Tensor& block_mapping) {
|
||||
void swap_blocks(
|
||||
torch::Tensor& src,
|
||||
torch::Tensor& dst,
|
||||
const std::map<int64_t, int64_t>& block_mapping) {
|
||||
torch::Device src_device = src.device();
|
||||
torch::Device dst_device = dst.device();
|
||||
cudaMemcpyKind memcpy_type;
|
||||
if (src_device.is_cuda() && dst_device.is_cuda()) {
|
||||
TORCH_CHECK(src_device.index() == dst_device.index(),
|
||||
"src and dst must be on the same GPU");
|
||||
TORCH_CHECK(
|
||||
src_device.index() == dst_device.index(),
|
||||
"src and dst must be on the same GPU");
|
||||
memcpy_type = cudaMemcpyDeviceToDevice;
|
||||
} else if (src_device.is_cuda() && dst_device.is_cpu()) {
|
||||
memcpy_type = cudaMemcpyDeviceToHost;
|
||||
@ -38,44 +40,41 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
TORCH_CHECK(false, "Invalid device combination");
|
||||
}
|
||||
|
||||
// NOTE(youkaichao): keep in mind that `block_mapping` should be
|
||||
// a cpu tensor, otherwise every `item` call will require a gpu-cpu
|
||||
// synchronization.
|
||||
TORCH_CHECK(block_mapping.device().is_cpu(), "block_mapping must be on CPU");
|
||||
|
||||
char* src_ptr = static_cast<char*>(src.data_ptr());
|
||||
char* dst_ptr = static_cast<char*>(dst.data_ptr());
|
||||
char *src_ptr = static_cast<char*>(src.data_ptr());
|
||||
char *dst_ptr = static_cast<char*>(dst.data_ptr());
|
||||
|
||||
const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
|
||||
const at::cuda::OptionalCUDAGuard device_guard(
|
||||
src_device.is_cuda() ? src_device : dst_device);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(src_device.is_cuda() ? src_device : dst_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
// NOTE(woosuk): This can be slow if the number of blocks is large.
|
||||
const int64_t num_blocks = block_mapping.size(0);
|
||||
for (size_t i = 0; i < num_blocks; i++) {
|
||||
int64_t src_block_number = block_mapping[i][0].item<int64_t>();
|
||||
int64_t dst_block_number = block_mapping[i][1].item<int64_t>();
|
||||
for (const auto& pair : block_mapping) {
|
||||
int64_t src_block_number = pair.first;
|
||||
int64_t dst_block_number = pair.second;
|
||||
int64_t src_offset = src_block_number * block_size_in_bytes;
|
||||
int64_t dst_offset = dst_block_number * block_size_in_bytes;
|
||||
cudaMemcpyAsync(dst_ptr + dst_offset, src_ptr + src_offset,
|
||||
block_size_in_bytes, memcpy_type, stream);
|
||||
cudaMemcpyAsync(
|
||||
dst_ptr + dst_offset,
|
||||
src_ptr + src_offset,
|
||||
block_size_in_bytes,
|
||||
memcpy_type,
|
||||
stream);
|
||||
}
|
||||
}
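// Illustrative sketch (not from this diff): with the new signature, callers pass
// block_mapping as a 2D CPU int64 tensor of (src, dst) pairs, in line with the
// NOTE above about keeping it on the CPU so the per-pair item() reads do not
// force GPU synchronization. The helper name and pair container are hypothetical.
torch::Tensor make_block_mapping(
    const std::vector<std::pair<int64_t, int64_t>>& pairs) {
  auto opts = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCPU);
  torch::Tensor mapping =
      torch::empty({static_cast<int64_t>(pairs.size()), 2}, opts);
  auto acc = mapping.accessor<int64_t, 2>();
  for (size_t i = 0; i < pairs.size(); ++i) {
    acc[i][0] = pairs[i].first;   // source block number
    acc[i][1] = pairs[i].second;  // destination block number
  }
  return mapping;
}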
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Grid: (num_layers, num_pairs)
|
||||
template <typename scalar_t>
|
||||
__global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
|
||||
int64_t* value_cache_ptrs,
|
||||
const int64_t* __restrict__ block_mapping,
|
||||
const int numel_per_block) {
|
||||
template<typename scalar_t>
|
||||
__global__ void copy_blocks_kernel(
|
||||
int64_t* key_cache_ptrs,
|
||||
int64_t* value_cache_ptrs,
|
||||
const int64_t* __restrict__ block_mapping,
|
||||
const int numel_per_block) {
|
||||
const int layer_idx = blockIdx.x;
|
||||
const int pair_idx = blockIdx.y;
|
||||
|
||||
scalar_t* key_cache = reinterpret_cast<scalar_t*>(key_cache_ptrs[layer_idx]);
|
||||
scalar_t* value_cache =
|
||||
reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
|
||||
scalar_t* value_cache = reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
|
||||
int64_t src_block_number = block_mapping[2 * pair_idx];
|
||||
int64_t dst_block_number = block_mapping[2 * pair_idx + 1];
|
||||
|
||||
@ -93,14 +92,12 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& block_mapping) {
|
||||
void copy_blocks(
|
||||
std::vector<torch::Tensor>& key_caches,
|
||||
std::vector<torch::Tensor>& value_caches,
|
||||
const std::map<int64_t, std::vector<int64_t>>& block_mapping) {
|
||||
int num_layers = key_caches.size();
|
||||
TORCH_CHECK(num_layers == value_caches.size());
|
||||
if (num_layers == 0) {
|
||||
@ -114,23 +111,29 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
int64_t key_cache_ptrs[num_layers];
|
||||
int64_t value_cache_ptrs[num_layers];
|
||||
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
|
||||
key_cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
|
||||
value_cache_ptrs[layer_idx] =
|
||||
reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
|
||||
key_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
|
||||
value_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
|
||||
}
|
||||
|
||||
// block_mapping is a 2D tensor with shape (num_pairs, 2).
|
||||
int num_pairs = block_mapping.size(0);
|
||||
// Create block mapping array.
|
||||
std::vector<int64_t> block_mapping_vec;
|
||||
for (const auto& pair : block_mapping) {
|
||||
int64_t src_block_number = pair.first;
|
||||
for (int64_t dst_block_number : pair.second) {
|
||||
block_mapping_vec.push_back(src_block_number);
|
||||
block_mapping_vec.push_back(dst_block_number);
|
||||
}
|
||||
}
|
||||
int64_t* block_mapping_array = block_mapping_vec.data();
|
||||
int num_pairs = block_mapping_vec.size() / 2;
|
||||
|
||||
// Move the data structures to the GPU.
|
||||
// NOTE: This synchronizes the CPU and GPU.
|
||||
torch::Tensor key_cache_ptrs_tensor =
|
||||
torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
torch::Tensor value_cache_ptrs_tensor =
|
||||
torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
|
||||
.to(cache_device);
|
||||
torch::Tensor key_cache_ptrs_tensor = torch::from_blob(
|
||||
key_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
|
||||
torch::Tensor value_cache_ptrs_tensor = torch::from_blob(
|
||||
value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
|
||||
torch::Tensor block_mapping_tensor = torch::from_blob(
|
||||
block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device);
|
||||
|
||||
// Launch the kernel.
|
||||
const int numel_per_block = key_caches[0][0].numel();
|
||||
@ -139,28 +142,31 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
|
||||
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
|
||||
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
key_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
value_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
block_mapping.data_ptr<int64_t>(), numel_per_block);
|
||||
}));
|
||||
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
|
||||
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
key_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
value_cache_ptrs_tensor.data_ptr<int64_t>(),
|
||||
block_mapping_tensor.data_ptr<int64_t>(),
|
||||
numel_per_block);
|
||||
}));
|
||||
}
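// Illustrative sketch (not from this diff): the kernel above consumes the mapping
// as a flat [src0, dst0, src1, dst1, ...] int64 array, one blockIdx.y per pair.
// The flattening mirrors the block_mapping_vec loop in the map-based variant shown
// above; the container type and example values here are illustrative only.
std::vector<int64_t> flatten_block_mapping(
    const std::map<int64_t, std::vector<int64_t>>& block_mapping) {
  std::vector<int64_t> flat;
  for (const auto& [src, dsts] : block_mapping) {
    for (int64_t dst : dsts) {
      flat.push_back(src);  // read by the kernel as block_mapping[2 * pair_idx]
      flat.push_back(dst);  // read by the kernel as block_mapping[2 * pair_idx + 1]
    }
  }
  return flat;  // e.g. {{0,{3,4}},{1,{5}}} -> {0, 3, 0, 4, 1, 5}
}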
|
||||
|
||||
namespace vllm {
|
||||
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
template<typename scalar_t, typename cache_t, bool is_fp8_kv_cache>
|
||||
__global__ void reshape_and_cache_kernel(
|
||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
|
||||
cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x,
|
||||
// block_size, x]
|
||||
cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size,
|
||||
// block_size]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int key_stride, const int value_stride, const int num_heads,
|
||||
const int head_size, const int block_size, const int x,
|
||||
const float kv_scale) {
|
||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
|
||||
cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
||||
cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int key_stride,
|
||||
const int value_stride,
|
||||
const int num_heads,
|
||||
const int head_size,
|
||||
const int block_size,
|
||||
const int x,
|
||||
const float kv_scale) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
if (slot_idx < 0) {
|
||||
@ -181,84 +187,60 @@ __global__ void reshape_and_cache_kernel(
|
||||
const int x_idx = head_offset / x;
|
||||
const int x_offset = head_offset % x;
|
||||
|
||||
const int64_t tgt_key_idx =
|
||||
block_idx * num_heads * (head_size / x) * block_size * x +
|
||||
head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
|
||||
block_offset * x + x_offset;
|
||||
const int64_t tgt_value_idx =
|
||||
block_idx * num_heads * head_size * block_size +
|
||||
head_idx * head_size * block_size + head_offset * block_size +
|
||||
block_offset;
|
||||
const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
|
||||
+ head_idx * (head_size / x) * block_size * x
|
||||
+ x_idx * block_size * x
|
||||
+ block_offset * x
|
||||
+ x_offset;
|
||||
const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size
|
||||
+ head_idx * head_size * block_size
|
||||
+ head_offset * block_size
|
||||
+ block_offset;
|
||||
scalar_t tgt_key = key[src_key_idx];
|
||||
scalar_t tgt_value = value[src_value_idx];
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
if constexpr (is_fp8_kv_cache) {
|
||||
#if defined(ENABLE_FP8_E5M2)
|
||||
key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_key);
|
||||
value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion<uint8_t, scalar_t>(tgt_value);
|
||||
#elif defined(ENABLE_FP8_E4M3)
|
||||
key_cache[tgt_key_idx] = fp8_e4m3::scaled_vec_conversion<uint8_t, scalar_t>(tgt_key, kv_scale);
|
||||
value_cache[tgt_value_idx] = fp8_e4m3::scaled_vec_conversion<uint8_t, scalar_t>(tgt_value, kv_scale);
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
} else {
|
||||
key_cache[tgt_key_idx] = tgt_key;
|
||||
value_cache[tgt_value_idx] = tgt_value;
|
||||
} else {
|
||||
key_cache[tgt_key_idx] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, kv_scale);
|
||||
value_cache[tgt_value_idx] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, kv_scale);
|
||||
}
|
||||
}
|
||||
}
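// Illustrative sketch (not from this diff): concrete index arithmetic for the
// [num_blocks, num_heads, head_size/x, block_size, x] key-cache layout used above,
// where x = 16 / sizeof(scalar_t) keeps 16 bytes of each head contiguous for
// vectorized access. The function name and example values are hypothetical.
inline int64_t key_cache_index(int num_heads, int head_size, int block_size, int x,
                               int64_t block_idx, int head_idx, int head_offset,
                               int64_t block_offset) {
  const int x_idx = head_offset / x;
  const int x_offset = head_offset % x;
  return block_idx * num_heads * (head_size / x) * block_size * x +
         head_idx * (head_size / x) * block_size * x +
         x_idx * block_size * x + block_offset * x + x_offset;
}
// e.g. for fp16 (x = 8): key_cache_index(8, 128, 16, 8, 7, 3, 45, 5) == 121517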
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void reshape_and_cache_flash_kernel(
|
||||
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
|
||||
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
|
||||
scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads,
|
||||
// head_size]
|
||||
scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads,
|
||||
// head_size]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int block_stride, const int key_stride, const int value_stride,
|
||||
const int num_heads, const int head_size, const int block_size) {
|
||||
const int64_t token_idx = blockIdx.x;
|
||||
const int64_t slot_idx = slot_mapping[token_idx];
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
if (slot_idx < 0) {
|
||||
return;
|
||||
}
|
||||
const int64_t block_idx = slot_idx / block_size;
|
||||
const int64_t block_offset = slot_idx % block_size;
|
||||
const int n = num_heads * head_size;
|
||||
for (int i = threadIdx.x; i < n; i += blockDim.x) {
|
||||
const int64_t src_key_idx = token_idx * key_stride + i;
|
||||
const int64_t src_value_idx = token_idx * value_stride + i;
|
||||
const int head_idx = i / head_size;
|
||||
const int head_offset = i % head_size;
|
||||
const int64_t tgt_value_idx = block_idx * block_stride +
|
||||
block_offset * num_heads * head_size +
|
||||
head_idx * head_size + head_offset;
|
||||
k_cache[tgt_value_idx] = key[src_key_idx];
|
||||
v_cache[tgt_value_idx] = value[src_value_idx];
|
||||
}
|
||||
}
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
// KV_T is the data type of the key and value tensors.
// CACHE_T is the stored data type of the kv-cache.
// KV_DTYPE is the real data type of the kv-cache.
|
||||
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<KV_T*>(key.data_ptr()), \
|
||||
reinterpret_cast<KV_T*>(value.data_ptr()), \
|
||||
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
|
||||
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
|
||||
slot_mapping.data_ptr<int64_t>(), key_stride, value_stride, \
|
||||
num_heads, head_size, block_size, x, kv_scale);
|
||||
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \
|
||||
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, IS_FP8_KV_CACHE><<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<KV_T*>(key.data_ptr()), \
|
||||
reinterpret_cast<KV_T*>(value.data_ptr()), \
|
||||
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
|
||||
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
|
||||
slot_mapping.data_ptr<int64_t>(), \
|
||||
key_stride, \
|
||||
value_stride, \
|
||||
num_heads, \
|
||||
head_size, \
|
||||
block_size, \
|
||||
x, \
|
||||
kv_scale);
|
||||
|
||||
void reshape_and_cache(
|
||||
torch::Tensor& key, // [num_tokens, num_heads, head_size]
|
||||
torch::Tensor& value, // [num_tokens, num_heads, head_size]
|
||||
torch::Tensor&
|
||||
key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
||||
torch::Tensor&
|
||||
value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||
torch::Tensor& slot_mapping, // [num_tokens]
|
||||
const std::string& kv_cache_dtype, const double kv_scale) {
|
||||
torch::Tensor& key, // [num_tokens, num_heads, head_size]
|
||||
torch::Tensor& value, // [num_tokens, num_heads, head_size]
|
||||
torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
||||
torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
|
||||
torch::Tensor& slot_mapping, // [num_tokens]
|
||||
const std::string& kv_cache_dtype,
|
||||
const float kv_scale)
|
||||
{
|
||||
int num_tokens = key.size(0);
|
||||
int num_heads = key.size(1);
|
||||
int head_size = key.size(2);
|
||||
@ -272,78 +254,66 @@ void reshape_and_cache(
|
||||
dim3 block(std::min(num_heads * head_size, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
|
||||
CALL_RESHAPE_AND_CACHE)
|
||||
}
|
||||
|
||||
void reshape_and_cache_flash(
|
||||
torch::Tensor& key, // [num_tokens, num_heads, head_size]
|
||||
torch::Tensor& value, // [num_tokens, num_heads, head_size]
|
||||
torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size]
|
||||
torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size]
|
||||
torch::Tensor& slot_mapping, // [num_tokens]
|
||||
const std::string& kv_cache_dtype) {
|
||||
// FIXME: only support auto datatype, does not support fp8
|
||||
if (kv_cache_dtype != "auto") {
|
||||
if (kv_cache_dtype == "auto") {
|
||||
if (key.dtype() == at::ScalarType::Float) {
|
||||
CALL_RESHAPE_AND_CACHE(float, float, false);
|
||||
} else if (key.dtype() == at::ScalarType::Half) {
|
||||
CALL_RESHAPE_AND_CACHE(uint16_t, uint16_t, false);
|
||||
} else if (key.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false);
|
||||
}
|
||||
} else if (kv_cache_dtype == "fp8") {
|
||||
if (key.dtype() == at::ScalarType::Float) {
|
||||
CALL_RESHAPE_AND_CACHE(float, uint8_t, true);
|
||||
} else if (key.dtype() == at::ScalarType::Half) {
|
||||
CALL_RESHAPE_AND_CACHE(uint16_t, uint8_t, true);
|
||||
} else if (key.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_RESHAPE_AND_CACHE(__nv_bfloat16, uint8_t, true);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
|
||||
}
|
||||
int num_tokens = key.size(0);
|
||||
int num_heads = key.size(1);
|
||||
int head_size = key.size(2);
|
||||
int block_size = k_cache.size(1);
|
||||
|
||||
int key_stride = key.stride(0);
|
||||
int value_stride = value.stride(0);
|
||||
int block_stride = k_cache.stride(0);
|
||||
TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0));
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(num_heads * head_size, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
key.scalar_type(), "reshape_and_cache_flash", [&] {
|
||||
vllm::reshape_and_cache_flash_kernel<scalar_t>
|
||||
<<<grid, block, 0, stream>>>(
|
||||
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
|
||||
k_cache.data_ptr<scalar_t>(), v_cache.data_ptr<scalar_t>(),
|
||||
slot_mapping.data_ptr<int64_t>(), block_stride, key_stride,
|
||||
value_stride, num_heads, head_size, block_size);
|
||||
});
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
|
||||
__global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
|
||||
Tout* __restrict__ dst_cache,
|
||||
const float kv_scale,
|
||||
const int64_t block_stride) {
|
||||
template<typename Tout, typename Tin>
|
||||
__global__ void convert_fp8_kernel(
|
||||
const Tin* __restrict__ src_cache,
|
||||
Tout* __restrict__ dst_cache,
|
||||
const int64_t block_stride) {
|
||||
const int64_t block_idx = blockIdx.x;
|
||||
for (int i = threadIdx.x; i < block_stride; i += blockDim.x) {
|
||||
int64_t idx = block_idx * block_stride + i;
|
||||
dst_cache[idx] =
|
||||
fp8::scaled_convert<Tout, Tin, kv_dt>(src_cache[idx], kv_scale);
|
||||
#if defined(ENABLE_FP8_E5M2)
|
||||
dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion<Tout, Tin>(src_cache[idx]);
|
||||
#elif defined(ENABLE_FP8_E4M3)
|
||||
dst_cache[idx] = fp8_e4m3::vec_conversion<Tout, Tin>(src_cache[idx]);
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
|
||||
#define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \
|
||||
vllm::convert_fp8_kernel<Tout, Tin, KV_DTYPE><<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<Tin*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<Tout*>(dst_cache.data_ptr()), kv_scale, block_stride);
|
||||
#define CALL_CONVERT_FP8(Tout, Tin) \
|
||||
vllm::convert_fp8_kernel<Tout, Tin><<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<Tin*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<Tout*>(dst_cache.data_ptr()), \
|
||||
block_stride);
|
||||
|
||||
// Only for testing.
|
||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double kv_scale, const std::string& kv_cache_dtype) {
|
||||
void convert_fp8(
|
||||
torch::Tensor& src_cache,
|
||||
torch::Tensor& dst_cache)
|
||||
{
|
||||
torch::Device src_device = src_cache.device();
|
||||
torch::Device dst_device = dst_cache.device();
|
||||
TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
|
||||
TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU")
|
||||
TORCH_CHECK(src_device.index() == dst_device.index(),
|
||||
"src and dst must be on the same GPU");
|
||||
TORCH_CHECK(
|
||||
src_device.index() == dst_device.index(),
|
||||
"src and dst must be on the same GPU");
|
||||
at::cuda::OptionalCUDAGuard device_guard(src_device);
|
||||
|
||||
int64_t num_blocks = src_cache.size(0);
|
||||
@ -353,37 +323,17 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
dim3 block(std::min(block_stride, int64_t(512)));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
if (kv_cache_dtype == "auto") {
|
||||
if (src_cache.dtype() == at::ScalarType::Float) {
|
||||
CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kAuto);
|
||||
} else if (src_cache.dtype() == at::ScalarType::Half) {
|
||||
CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto);
|
||||
} else if (src_cache.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::Float) {
|
||||
CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kAuto);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::Half) {
|
||||
CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kAuto);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kAuto);
|
||||
}
|
||||
} else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") {
|
||||
if (src_cache.dtype() == at::ScalarType::Float) {
|
||||
CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
} else if (src_cache.dtype() == at::ScalarType::Half) {
|
||||
CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
} else if (src_cache.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CONVERT_FP8(uint8_t, __nv_bfloat16,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::Float) {
|
||||
CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::Half) {
|
||||
CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CONVERT_FP8(__nv_bfloat16, uint8_t,
|
||||
vllm::Fp8KVCacheDataType::kFp8E4M3);
|
||||
}
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype);
|
||||
if (src_cache.dtype() == at::ScalarType::Float) {
|
||||
CALL_CONVERT_FP8(uint8_t, float);
|
||||
} else if (src_cache.dtype() == at::ScalarType::Half) {
|
||||
CALL_CONVERT_FP8(uint8_t, uint16_t);
|
||||
} else if (src_cache.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CONVERT_FP8(uint8_t, __nv_bfloat16);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::Float) {
|
||||
CALL_CONVERT_FP8(float, uint8_t);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::Half) {
|
||||
CALL_CONVERT_FP8(uint16_t, uint8_t);
|
||||
} else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
|
||||
CALL_CONVERT_FP8(__nv_bfloat16, uint8_t);
|
||||
}
|
||||
}
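// Illustrative sketch (not from this diff): round-tripping a cache tensor through
// the fp8 format with the test-only helper. The argument order (dst, src, scale,
// kv_cache_dtype) matches the new declaration in csrc/cache.h above; the wrapper
// name and tensor shapes are hypothetical.
void fp8_round_trip_example(int64_t num_blocks, int64_t block_elems) {
  torch::Tensor src = torch::randn(
      {num_blocks, block_elems},
      torch::dtype(torch::kFloat16).device(torch::kCUDA));
  torch::Tensor quantized =
      torch::empty(src.sizes(), src.options().dtype(torch::kUInt8));
  torch::Tensor restored = torch::empty_like(src);
  convert_fp8(quantized, src, 1.0, "fp8");       // scalar_t -> fp8 bytes
  convert_fp8(restored, quantized, 1.0, "fp8");  // fp8 bytes -> scalar_t
}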
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
#include "cpu_types.hpp"
|
||||
|
||||
namespace {
|
||||
template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8&),
|
||||
template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8 &),
|
||||
bool is_gated>
|
||||
void activation_kernel(int num_tokens, int d, scalar_t* __restrict__ input,
|
||||
scalar_t* __restrict__ output) {
|
||||
void activation_kernel(int num_tokens, int d, scalar_t *__restrict__ input,
|
||||
scalar_t *__restrict__ output) {
|
||||
using scalar_vec_t = vec_op::vec_t<scalar_t>;
|
||||
constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
|
||||
|
||||
@ -34,13 +34,13 @@ void activation_kernel(int num_tokens, int d, scalar_t* __restrict__ input,
|
||||
}
|
||||
}
|
||||
|
||||
FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8& x) {
|
||||
FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8 &x) {
|
||||
const vec_op::FP32Vec8 zeros(0.0);
|
||||
const vec_op::FP32Vec8 ones(1.0);
|
||||
return x / (ones + (zeros - x).exp());
|
||||
}
|
||||
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8& x) {
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8 &x) {
|
||||
const vec_op::FP32Vec8 ones(1.0);
|
||||
const vec_op::FP32Vec8 w1(0.79788456f);
|
||||
const vec_op::FP32Vec8 w2(0.044715f);
|
||||
@ -50,7 +50,7 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8& x) {
|
||||
return w3 * x * (ones + t);
|
||||
}
|
||||
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8 &x) {
|
||||
const vec_op::FP32Vec8 ones(1.0);
|
||||
const vec_op::FP32Vec8 w1(0.79788456f);
|
||||
const vec_op::FP32Vec8 w2(0.044715f);
|
||||
@ -59,21 +59,14 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
|
||||
return w3 * x * (ones + t);
|
||||
}
|
||||
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) {
|
||||
const vec_op::FP32Vec8 zeros(0.0);
|
||||
const vec_op::FP32Vec8 ones(1.0);
|
||||
const vec_op::FP32Vec8 w1(1.702f);
|
||||
return x / (ones + (zeros - w1 * x).exp());
|
||||
}
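// Illustrative sketch (not from this diff): scalar reference forms of the
// vectorized activations above, handy when sanity-checking the FP32Vec8 math.
// Assumes <cmath> is available through the existing includes.
inline float silu_ref(float x)       { return x / (1.0f + std::exp(-x)); }
inline float gelu_quick_ref(float x) { return x / (1.0f + std::exp(-1.702f * x)); }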
|
||||
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8 &x) {
|
||||
const vec_op::FP32Vec8 ones(1.0);
|
||||
const vec_op::FP32Vec8 w1(M_SQRT1_2);
|
||||
const vec_op::FP32Vec8 w2(0.5);
|
||||
return x * w2 * (ones + (x * w1).er());
|
||||
}
|
||||
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8& x) {
|
||||
FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8 &x) {
|
||||
const vec_op::FP32Vec8 ones(1.0);
|
||||
const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5);
|
||||
const vec_op::FP32Vec8 w2(0.5);
|
||||
@ -82,36 +75,40 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8& x) {
|
||||
const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3);
|
||||
return x * w2 * (ones + inner.tanh());
|
||||
}
|
||||
}; // namespace
|
||||
}; // namespace
|
||||
|
||||
void silu_and_mul(torch::Tensor& out, torch::Tensor& input) {
|
||||
void silu_and_mul(torch::Tensor &out, torch::Tensor &input) {
|
||||
int num_tokens = input.numel() / input.size(-1);
|
||||
int d = input.size(-1) / 2;
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
|
||||
activation_kernel<scalar_t, silu_act, true>(
|
||||
num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
|
||||
CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
|
||||
});
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input.scalar_type(), "silu_and_mul_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
|
||||
activation_kernel<scalar_t, silu_act, true>(num_tokens, d,
|
||||
input.data_ptr<scalar_t>(),
|
||||
out.data_ptr<scalar_t>());
|
||||
CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
|
||||
});
|
||||
}
|
||||
|
||||
void gelu_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
void gelu_and_mul(torch::Tensor &out, // [..., d]
|
||||
torch::Tensor &input) // [..., 2 * d]
|
||||
{
|
||||
int num_tokens = input.numel() / input.size(-1);
|
||||
int d = input.size(-1) / 2;
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
|
||||
activation_kernel<scalar_t, gelu_act, true>(
|
||||
num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
|
||||
CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
|
||||
});
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input.scalar_type(), "gelu_and_mul_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
|
||||
activation_kernel<scalar_t, gelu_act, true>(num_tokens, d,
|
||||
input.data_ptr<scalar_t>(),
|
||||
out.data_ptr<scalar_t>());
|
||||
CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
|
||||
});
|
||||
}
|
||||
|
||||
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
void gelu_tanh_and_mul(torch::Tensor &out, // [..., d]
|
||||
torch::Tensor &input) // [..., 2 * d]
|
||||
{
|
||||
int num_tokens = input.numel() / input.size(-1);
|
||||
int d = input.size(-1) / 2;
|
||||
@ -126,7 +123,7 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
|
||||
});
|
||||
}
|
||||
|
||||
void gelu_new(torch::Tensor& out, torch::Tensor& input) {
|
||||
void gelu_new(torch::Tensor &out, torch::Tensor &input) {
|
||||
int num_tokens = input.numel() / input.size(-1);
|
||||
int d = input.size(-1);
|
||||
|
||||
@ -138,7 +135,7 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input) {
|
||||
});
|
||||
}
|
||||
|
||||
void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
|
||||
void gelu_fast(torch::Tensor &out, torch::Tensor &input) {
|
||||
int num_tokens = input.numel() / input.size(-1);
|
||||
int d = input.size(-1);
|
||||
|
||||
@ -149,15 +146,3 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
|
||||
CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
|
||||
});
|
||||
}
|
||||
|
||||
void gelu_quick(torch::Tensor& out, torch::Tensor& input) {
|
||||
int num_tokens = input.numel() / input.size(-1);
|
||||
int d = input.size(-1);
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(gelu_quick_impl)
|
||||
activation_kernel<scalar_t, gelu_quick_act, false>(
|
||||
num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
|
||||
CPU_KERNEL_GUARD_OUT(gelu_quick_impl)
|
||||
});
|
||||
}
|
||||
|
||||
@ -2,8 +2,7 @@
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename scalar_t>
|
||||
struct KernelVecType {
|
||||
template <typename scalar_t> struct KernelVecType {
|
||||
using q_load_vec_type = void;
|
||||
using q_vec_type = void;
|
||||
using k_load_vec_type = void;
|
||||
@ -12,8 +11,7 @@ struct KernelVecType {
|
||||
using v_load_vec_type = void;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelVecType<float> {
|
||||
template <> struct KernelVecType<float> {
|
||||
using q_load_vec_type = vec_op::FP32Vec4;
|
||||
using q_vec_type = vec_op::FP32Vec16;
|
||||
using k_load_vec_type = vec_op::FP32Vec16;
|
||||
@ -23,8 +21,7 @@ struct KernelVecType<float> {
|
||||
};
|
||||
|
||||
#ifdef __AVX512BF16__
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
template <> struct KernelVecType<c10::BFloat16> {
|
||||
using q_load_vec_type = vec_op::BF16Vec8;
|
||||
using q_vec_type = vec_op::BF16Vec32;
|
||||
using k_load_vec_type = vec_op::BF16Vec32;
|
||||
@ -33,8 +30,7 @@ struct KernelVecType<c10::BFloat16> {
|
||||
using v_load_vec_type = vec_op::BF16Vec16;
|
||||
};
|
||||
#else
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
template <> struct KernelVecType<c10::BFloat16> {
|
||||
using q_load_vec_type = vec_op::BF16Vec8;
|
||||
using q_vec_type = vec_op::FP32Vec16;
|
||||
using k_load_vec_type = vec_op::BF16Vec16;
|
||||
@ -45,7 +41,7 @@ struct KernelVecType<c10::BFloat16> {
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
|
||||
FORCE_INLINE std::pair<T, T> reduceSoftmax(T *data, const int size,
|
||||
const int capacity) {
|
||||
T max = data[0];
|
||||
for (int i = 1; i < size; ++i) {
|
||||
@ -71,15 +67,14 @@ FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
|
||||
const int capacity,
|
||||
const float alibi_slope,
|
||||
const int start_index,
|
||||
const int seq_len) {
|
||||
data[0] += alibi_slope * (start_index - seq_len + 1);
|
||||
FORCE_INLINE std::pair<T, T>
|
||||
reduceSoftmaxAlibi(T *data, const int size, const int capacity,
|
||||
const float alibi_slope, const int start_index,
|
||||
const int context_len) {
|
||||
data[0] += alibi_slope * (start_index - context_len + 1);
|
||||
T max = data[0];
|
||||
for (int i = 1; i < size; ++i) {
|
||||
T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
|
||||
T qk = data[i] + alibi_slope * (start_index + i - context_len + 1);
|
||||
data[i] = qk;
|
||||
max = max >= qk ? max : qk;
|
||||
}
|
||||
@ -103,7 +98,7 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
|
||||
}
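// Illustrative sketch (not from this diff): the bias applied inside
// reduceSoftmaxAlibi above, written out for a single logit. The token at absolute
// position (start_index + i) receives alibi_slope * (start_index + i - context_len + 1),
// so the final token of the sequence gets a bias of zero. The helper is hypothetical.
inline float alibi_bias(float alibi_slope, int start_index, int i, int context_len) {
  return alibi_slope * (start_index + i - context_len + 1);
}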
|
||||
|
||||
template <typename T>
|
||||
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
|
||||
FORCE_INLINE void reducePartitonSoftmax(const T *max_data, T *sum_data,
|
||||
const int size) {
|
||||
T max = max_data[0];
|
||||
for (int i = 1; i < size; ++i) {
|
||||
@ -137,9 +132,9 @@ struct reduceQKBlockKernel {
|
||||
static_assert(k_load_vec_type::get_elem_num() % x == 0);
|
||||
static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);
|
||||
|
||||
FORCE_INLINE static void call(const scalar_t* __restrict__ q,
|
||||
const scalar_t* __restrict__ k_block,
|
||||
float* __restrict__ logits, float scale,
|
||||
FORCE_INLINE static void call(const scalar_t *__restrict__ q,
|
||||
const scalar_t *__restrict__ k_block,
|
||||
float *__restrict__ logits, float scale,
|
||||
const int token_num) {
|
||||
const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;
|
||||
|
||||
@ -201,8 +196,8 @@ struct reduceQKBlockKernel {
|
||||
|
||||
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE,
|
||||
int HEAD_PARTITION_SIZE, typename acc_t>
|
||||
FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
|
||||
acc_t&& acc) {
|
||||
FORCE_INLINE void reduceValueBlock(const float *prob, const scalar_t *v_block,
|
||||
acc_t &&acc) {
|
||||
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
|
||||
constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
|
||||
static_assert(BLOCK_SIZE == ELEM_NUM);
|
||||
@ -214,65 +209,66 @@ FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
|
||||
acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
|
||||
});
|
||||
}
|
||||
}; // namespace
|
||||
}; // namespace
|
||||
|
||||
// Paged attention v1
|
||||
namespace {
|
||||
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
|
||||
struct paged_attention_v1_impl {
|
||||
static void call(
|
||||
scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
|
||||
static void
|
||||
call(scalar_t *__restrict__ out, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size/x, block_size, x]
|
||||
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
|
||||
const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size, block_size]
|
||||
const int num_kv_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs,
|
||||
// max_num_blocks_per_seq]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
const int num_seqs, const int num_heads) {
|
||||
const int num_kv_heads, const float scale,
|
||||
const int
|
||||
*__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int *__restrict__ context_lens, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float *__restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
const int num_seqs, const int num_heads) {
|
||||
constexpr int x = 16 / sizeof(scalar_t);
|
||||
const int num_queries_per_kv = num_heads / num_kv_heads;
|
||||
|
||||
static_assert(BLOCK_SIZE == 16);
|
||||
|
||||
int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
|
||||
int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
|
||||
TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
|
||||
int max_context_len = max_num_blocks_per_seq * BLOCK_SIZE;
|
||||
int max_context_len_padded = (max_context_len + 15) & 0xFFFFFFF0;
|
||||
TORCH_CHECK((max_context_len_padded * sizeof(float)) % 64 == 0);
|
||||
|
||||
const int parallel_work_item_num = omp_get_max_threads();
|
||||
|
||||
size_t logits_bytes =
|
||||
parallel_work_item_num * max_seq_len_padded * sizeof(float);
|
||||
float* logits = (float*)std::aligned_alloc(
|
||||
64, logits_bytes); // Cacheline alignment for each context token.
|
||||
// [parallel_work_item_num, max_seq_len_padded]
|
||||
parallel_work_item_num * max_context_len_padded * sizeof(float);
|
||||
float *logits = (float *)std::aligned_alloc(
|
||||
64, logits_bytes); // Cacheline alignment for each context token.
|
||||
// [parallel_work_item_num, max_context_len_padded]
|
||||
|
||||
#pragma omp parallel for collapse(2) schedule(dynamic, 1)
|
||||
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
int seq_len = seq_lens[seq_idx];
|
||||
const int* seq_block_table =
|
||||
int context_len = context_lens[seq_idx];
|
||||
const int *seq_block_table =
|
||||
block_tables + max_num_blocks_per_seq * seq_idx;
|
||||
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int block_num = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int64_t kv_head_idx = head_idx / num_queries_per_kv;
|
||||
const scalar_t* __restrict__ q_vec_ptr =
|
||||
const scalar_t *__restrict__ q_vec_ptr =
|
||||
q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||
const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE;
|
||||
float* __restrict__ thread_block_logits =
|
||||
logits + omp_get_thread_num() * max_seq_len_padded;
|
||||
const int last_block_token_num =
|
||||
context_len - (block_num - 1) * BLOCK_SIZE;
|
||||
float *__restrict__ thread_block_logits =
|
||||
logits + omp_get_thread_num() * max_context_len_padded;
|
||||
|
||||
// Compute logits
|
||||
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||
const scalar_t* __restrict__ k_block_cache_ptr =
|
||||
const scalar_t *__restrict__ k_block_cache_ptr =
|
||||
k_cache + physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride;
|
||||
float* __restrict__ head_block_logits =
|
||||
float *__restrict__ head_block_logits =
|
||||
thread_block_logits + block_idx * BLOCK_SIZE;
|
||||
|
||||
reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
|
||||
@ -282,11 +278,12 @@ struct paged_attention_v1_impl {
|
||||
|
||||
// Compute softmax
|
||||
if (alibi_slopes) {
|
||||
reduceSoftmaxAlibi(thread_block_logits, seq_len,
|
||||
reduceSoftmaxAlibi(thread_block_logits, context_len,
|
||||
block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
|
||||
seq_len);
|
||||
context_len);
|
||||
} else {
|
||||
reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
|
||||
reduceSoftmax(thread_block_logits, context_len,
|
||||
block_num * BLOCK_SIZE);
|
||||
}
|
||||
|
||||
// Compute value
|
||||
@ -296,14 +293,14 @@ struct paged_attention_v1_impl {
|
||||
for (int head_part_idx = 0; head_part_idx < head_partition_num;
|
||||
++head_part_idx) {
|
||||
vec_op::FP32Vec16 accums[head_elem_num_per_partition];
|
||||
scalar_t* __restrict__ out_ptr =
|
||||
scalar_t *__restrict__ out_ptr =
|
||||
out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
|
||||
head_part_idx * head_elem_num_per_partition;
|
||||
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||
const float* __restrict__ prob_vec_ptr =
|
||||
const float *__restrict__ prob_vec_ptr =
|
||||
thread_block_logits + block_idx * BLOCK_SIZE;
|
||||
const scalar_t* __restrict__ v_block_cache_ptr =
|
||||
const scalar_t *__restrict__ v_block_cache_ptr =
|
||||
v_cache + physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride +
|
||||
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||
@ -314,7 +311,7 @@ struct paged_attention_v1_impl {
|
||||
if (block_idx != block_num - 1) {
|
||||
const int64_t next_physical_block_idx =
|
||||
seq_block_table[block_idx + 1];
|
||||
const scalar_t* __restrict__ next_v_block_cache_ptr =
|
||||
const scalar_t *__restrict__ next_v_block_cache_ptr =
|
||||
v_cache + next_physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride +
|
||||
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||
@ -343,16 +340,16 @@ struct paged_attention_v1_impl {
|
||||
#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
|
||||
paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call( \
|
||||
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
|
||||
block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
|
||||
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \
|
||||
num_heads);
|
||||
|
||||
template <typename T, int BLOCK_SIZE>
|
||||
void paged_attention_v1_impl_launcher(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||
const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||
torch::Tensor &out, torch::Tensor &query, torch::Tensor &key_cache,
|
||||
torch::Tensor &value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor &block_tables, torch::Tensor &context_lens,
|
||||
int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
@ -362,74 +359,68 @@ void paged_attention_v1_impl_launcher(
|
||||
int kv_head_stride = key_cache.stride(1);
|
||||
|
||||
// NOTE: alibi_slopes is optional.
|
||||
const float* alibi_slopes_ptr =
|
||||
const float *alibi_slopes_ptr =
|
||||
alibi_slopes
|
||||
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
||||
? reinterpret_cast<const float *>(alibi_slopes.value().data_ptr())
|
||||
: nullptr;
|
||||
|
||||
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
||||
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
||||
T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
|
||||
T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
|
||||
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||
T *out_ptr = reinterpret_cast<T *>(out.data_ptr());
|
||||
T *query_ptr = reinterpret_cast<T *>(query.data_ptr());
|
||||
T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
|
||||
T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
|
||||
int *block_tables_ptr = block_tables.data_ptr<int>();
|
||||
int *context_lens_ptr = context_lens.data_ptr<int>();
|
||||
|
||||
switch (head_size) {
|
||||
case 64:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||
break;
|
||||
case 80:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
|
||||
break;
|
||||
case 96:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
|
||||
break;
|
||||
case 112:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
|
||||
break;
|
||||
case 128:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
|
||||
break;
|
||||
case 192:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
|
||||
break;
|
||||
case 256:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
|
||||
break;
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||
break;
|
||||
case 64:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||
break;
|
||||
case 80:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
|
||||
break;
|
||||
case 96:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
|
||||
break;
|
||||
case 112:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
|
||||
break;
|
||||
case 128:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
|
||||
break;
|
||||
case 256:
|
||||
LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
|
||||
break;
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||
break;
|
||||
}
|
||||
}
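// Illustrative sketch (not from this diff): the v1 implementation above sizes its
// per-thread logits buffer with (len + 15) & 0xFFFFFFF0, i.e. the context length is
// rounded up to the next multiple of 16 floats (64 bytes) so each thread's slice
// stays cacheline-aligned. A minimal self-check of that rule (helper name is
// hypothetical):
constexpr int pad_to_16(int len) { return (len + 15) & 0xFFFFFFF0; }
static_assert(pad_to_16(1) == 16 && pad_to_16(16) == 16 && pad_to_16(17) == 32);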
|
||||
|
||||
#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
|
||||
paged_attention_v1_impl_launcher<T, BLOCK_SIZE>( \
|
||||
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
|
||||
seq_lens, max_seq_len, alibi_slopes);
|
||||
#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
|
||||
paged_attention_v1_impl_launcher<T, BLOCK_SIZE>( \
|
||||
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
|
||||
context_lens, max_context_len, alibi_slopes);
|
||||
|
||||
#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
|
||||
switch (block_size) { \
|
||||
case 16: \
|
||||
CALL_V1_KERNEL_LAUNCHER(T, 16); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||
break; \
|
||||
#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
|
||||
switch (block_size) { \
|
||||
case 16: \
|
||||
CALL_V1_KERNEL_LAUNCHER(T, 16); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
} // namespace
|
||||
|
||||
void paged_attention_v1(
|
||||
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||
const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
const int64_t blocksparse_head_sliding_step) {
|
||||
void paged_attention_v1(torch::Tensor &out, torch::Tensor &query,
|
||||
torch::Tensor &key_cache, torch::Tensor &value_cache,
|
||||
int num_kv_heads, float scale,
|
||||
torch::Tensor &block_tables,
|
||||
torch::Tensor &context_lens, int block_size,
|
||||
int max_context_len,
|
||||
const c10::optional<torch::Tensor> &alibi_slopes,
|
||||
const std::string &kv_cache_dtype, float kv_scale) {
|
||||
TORCH_CHECK(kv_scale == 1.0f);
|
||||
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
||||
"CPU backend does not support blocksparse attention yet.");
|
||||
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
|
||||
[&] {
|
||||
CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
|
||||
@ -443,24 +434,23 @@ namespace {
|
||||
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
|
||||
struct paged_attention_v2_impl {
|
||||
static void call(
|
||||
scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
|
||||
float* __restrict__ exp_sums, // [num_seqs, num_heads,
|
||||
// max_num_partitions]
|
||||
float* __restrict__ max_logits, // [num_seqs, num_heads,
|
||||
// max_num_partitions]
|
||||
scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
|
||||
// max_num_partitions, head_size]
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size/x, block_size, x]
|
||||
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size, block_size]
|
||||
scalar_t *__restrict__ out, // [num_seqs, num_heads, head_size]
|
||||
float *__restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
|
||||
float
|
||||
*__restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
|
||||
scalar_t *__restrict__ tmp_out, // [num_seqs, num_heads,
|
||||
// max_num_partitions, head_size]
|
||||
const scalar_t *__restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t *__restrict__ k_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size/x, block_size, x]
|
||||
const scalar_t *__restrict__ v_cache, // [num_blocks, num_kv_heads,
|
||||
// head_size, block_size]
|
||||
const int num_kv_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs,
|
||||
// max_num_blocks_per_seq]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int
|
||||
*__restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int *__restrict__ context_lens, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const float *__restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||
const int num_seqs, const int num_heads, const int max_num_partitions) {
|
||||
constexpr int x = 16 / sizeof(scalar_t);
|
||||
@ -475,25 +465,27 @@ struct paged_attention_v2_impl {
|
||||
for (int partition_idx = 0; partition_idx < max_num_partitions;
|
||||
++partition_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int start_token_idx = partition_idx * PARTITION_SIZE;
|
||||
|
||||
if (start_token_idx >= seq_len) continue;
|
||||
if (start_token_idx >= context_len)
|
||||
continue;
|
||||
|
||||
const int partition_num =
|
||||
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
const bool no_reduce = (partition_num == 1);
|
||||
const int token_num =
|
||||
(std::min(seq_len, start_token_idx + PARTITION_SIZE) -
|
||||
const int context_token_num =
|
||||
(std::min(context_len, start_token_idx + PARTITION_SIZE) -
|
||||
start_token_idx);
|
||||
const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int block_num =
|
||||
(context_token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
const int last_block_token_num =
|
||||
token_num - (block_num - 1) * BLOCK_SIZE;
|
||||
const int* seq_block_table = block_tables +
|
||||
context_token_num - (block_num - 1) * BLOCK_SIZE;
|
||||
const int *seq_block_table = block_tables +
|
||||
max_num_blocks_per_seq * seq_idx +
|
||||
start_token_idx / BLOCK_SIZE;
|
||||
const int64_t kv_head_idx = head_idx / num_queries_per_kv;
|
||||
const scalar_t* __restrict__ q_vec_ptr =
|
||||
const scalar_t *__restrict__ q_vec_ptr =
|
||||
q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||
|
||||
float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};
|
||||
@ -501,10 +493,10 @@ struct paged_attention_v2_impl {
|
||||
// Compute logits
|
||||
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||
const scalar_t* __restrict__ k_block_cache_ptr =
|
||||
const scalar_t *__restrict__ k_block_cache_ptr =
|
||||
k_cache + physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride;
|
||||
float* __restrict__ head_block_logits =
|
||||
float *__restrict__ head_block_logits =
|
||||
logits + block_idx * BLOCK_SIZE;
|
||||
|
||||
reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
|
||||
@ -515,16 +507,16 @@ struct paged_attention_v2_impl {
|
||||
std::pair<float, float> max_and_sum;
|
||||
if (alibi_slopes) {
|
||||
max_and_sum = reduceSoftmaxAlibi(
|
||||
logits, token_num, block_num * BLOCK_SIZE,
|
||||
alibi_slopes[head_idx], start_token_idx, seq_len);
|
||||
logits, context_token_num, block_num * BLOCK_SIZE,
|
||||
alibi_slopes[head_idx], start_token_idx, context_len);
|
||||
} else {
|
||||
max_and_sum =
|
||||
reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
|
||||
max_and_sum = reduceSoftmax(logits, context_token_num,
|
||||
block_num * BLOCK_SIZE);
|
||||
}
|
||||
|
||||
auto&& [max_logit, exp_sum] = max_and_sum;
|
||||
auto &&[max_logit, exp_sum] = max_and_sum;
|
||||
|
||||
scalar_t* __restrict__ output_buffer = nullptr;
|
||||
scalar_t *__restrict__ output_buffer = nullptr;
|
||||
if (!no_reduce) {
|
||||
auto idx = seq_idx * num_heads * max_num_partitions +
|
||||
head_idx * max_num_partitions + partition_idx;
|
||||
@ -546,13 +538,13 @@ struct paged_attention_v2_impl {
|
||||
for (int head_part_idx = 0; head_part_idx < head_partition_num;
|
||||
++head_part_idx) {
|
||||
vec_op::FP32Vec16 accums[head_elem_num_per_partition];
|
||||
scalar_t* __restrict__ out_ptr =
|
||||
scalar_t *__restrict__ out_ptr =
|
||||
output_buffer + head_part_idx * head_elem_num_per_partition;
|
||||
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||
const float* __restrict__ prob_vec_ptr =
|
||||
const float *__restrict__ prob_vec_ptr =
|
||||
logits + block_idx * BLOCK_SIZE;
|
||||
const scalar_t* __restrict__ v_block_cache_ptr =
|
||||
const scalar_t *__restrict__ v_block_cache_ptr =
|
||||
v_cache + physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride +
|
||||
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||
@ -563,7 +555,7 @@ struct paged_attention_v2_impl {
|
||||
if (block_idx != block_num - 1) {
|
||||
const int64_t next_physical_block_idx =
|
||||
seq_block_table[block_idx + 1];
|
||||
const scalar_t* __restrict__ next_v_block_cache_ptr =
|
||||
const scalar_t *__restrict__ next_v_block_cache_ptr =
|
||||
v_cache + next_physical_block_idx * kv_block_stride +
|
||||
kv_head_idx * kv_head_stride +
|
||||
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||
@ -591,11 +583,12 @@ struct paged_attention_v2_impl {
|
||||
#pragma omp parallel for collapse(2) schedule(static, 1)
|
||||
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int partition_num =
|
||||
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
|
||||
if (partition_num == 1) continue;
|
||||
if (partition_num == 1)
|
||||
continue;
|
||||
|
||||
reducePartitonSoftmax(
|
||||
max_logits + seq_idx * num_heads * max_num_partitions +
|
||||
@ -610,29 +603,30 @@ struct paged_attention_v2_impl {
|
||||
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
|
||||
static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
|
||||
constexpr int head_elem_num_per_group =
|
||||
16; // Note: didn't align with the cacheline size, due to some
|
||||
// HEAD_SIZE didn't align with 64 bytes
|
||||
16; // Note: didn't align with the cacheline size, due to some HEAD_SIZE
|
||||
// didn't align with 64 bytes
|
||||
static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
|
||||
constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
|
||||
const float* __restrict__ rescale_factors = exp_sums;
|
||||
const float *__restrict__ rescale_factors = exp_sums;
|
||||
#pragma omp parallel for collapse(3) schedule(static, 1)
|
||||
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||
for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int partition_num =
|
||||
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
(context_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||
|
||||
if (partition_num == 1) continue;
|
||||
if (partition_num == 1)
|
||||
continue;
|
||||
|
||||
const float* __restrict__ seq_head_rescale_factors =
|
||||
const float *__restrict__ seq_head_rescale_factors =
|
||||
rescale_factors + seq_idx * num_heads * max_num_partitions +
|
||||
head_idx * max_num_partitions;
|
||||
const scalar_t* __restrict__ seq_head_tmp_out =
|
||||
const scalar_t *__restrict__ seq_head_tmp_out =
|
||||
tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
|
||||
head_idx * max_num_partitions * HEAD_SIZE +
|
||||
group_idx * head_elem_num_per_group;
|
||||
scalar_t* __restrict__ seq_head_output =
|
||||
scalar_t *__restrict__ seq_head_output =
|
||||
out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
|
||||
group_idx * head_elem_num_per_group;
|
||||
|
||||
@ -651,21 +645,21 @@ struct paged_attention_v2_impl {
|
||||
}
|
||||
};
|
||||
|
||||
#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
|
||||
paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call( \
|
||||
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \
|
||||
key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
|
||||
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
||||
kv_block_stride, kv_head_stride, num_seqs, num_heads, \
|
||||
#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
|
||||
paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call( \
|
||||
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \
|
||||
key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
|
||||
context_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
||||
kv_block_stride, kv_head_stride, num_seqs, num_heads, \
|
||||
max_num_partitions);
|
||||
|
||||
template <typename T, int BLOCK_SIZE, int PARTITION_SIZE = 512>
|
||||
void paged_attention_v2_impl_launcher(
|
||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
||||
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||
torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits,
|
||||
torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache,
|
||||
torch::Tensor &value_cache, int num_kv_heads, float scale,
|
||||
torch::Tensor &block_tables, torch::Tensor &context_lens, int block_size,
|
||||
int max_context_len, const c10::optional<torch::Tensor> &alibi_slopes) {
|
||||
int num_seqs = query.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
@ -676,79 +670,73 @@ void paged_attention_v2_impl_launcher(
|
||||
int max_num_partitions = exp_sums.size(-1);
|
||||
|
||||
// NOTE: alibi_slopes is optional.
|
||||
const float* alibi_slopes_ptr =
|
||||
const float *alibi_slopes_ptr =
|
||||
alibi_slopes
|
||||
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
||||
? reinterpret_cast<const float *>(alibi_slopes.value().data_ptr())
|
||||
: nullptr;
|
||||
|
||||
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
||||
float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
|
||||
float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
|
||||
T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
|
||||
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
||||
T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
|
||||
T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
|
||||
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||
T *out_ptr = reinterpret_cast<T *>(out.data_ptr());
|
||||
float *exp_sums_ptr = reinterpret_cast<float *>(exp_sums.data_ptr());
|
||||
float *max_logits_ptr = reinterpret_cast<float *>(max_logits.data_ptr());
|
||||
T *tmp_out_ptr = reinterpret_cast<T *>(tmp_out.data_ptr());
|
||||
T *query_ptr = reinterpret_cast<T *>(query.data_ptr());
|
||||
T *key_cache_ptr = reinterpret_cast<T *>(key_cache.data_ptr());
|
||||
T *value_cache_ptr = reinterpret_cast<T *>(value_cache.data_ptr());
|
||||
int *block_tables_ptr = block_tables.data_ptr<int>();
|
||||
int *context_lens_ptr = context_lens.data_ptr<int>();
|
||||
|
||||
switch (head_size) {
|
||||
case 64:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||
break;
|
||||
case 80:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
|
||||
break;
|
||||
case 96:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
|
||||
break;
|
||||
case 112:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
|
||||
break;
|
||||
case 128:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
|
||||
break;
|
||||
case 192:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
|
||||
break;
|
||||
case 256:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
|
||||
break;
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||
break;
|
||||
case 64:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||
break;
|
||||
case 80:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
|
||||
break;
|
||||
case 96:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
|
||||
break;
|
||||
case 112:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
|
||||
break;
|
||||
case 128:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
|
||||
break;
|
||||
case 256:
|
||||
LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
|
||||
break;
|
||||
default:
|
||||
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||
break;
|
||||
}
|
||||
}
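
The switch above maps a runtime head_size onto a compile-time template parameter through LAUNCH_V2_ATTENTION_KERNEL. A toy version of the same dispatch pattern, with illustrative names only:

#include <cstdio>

// The kernel must be instantiated for each supported head size, so a
// runtime value is bridged to a template parameter via a switch.
template <int HEAD_SIZE>
void toy_kernel() { std::printf("head_size = %d\n", HEAD_SIZE); }

void dispatch_head_size(int head_size) {
  switch (head_size) {
    case 64:  toy_kernel<64>();  break;
    case 96:  toy_kernel<96>();  break;
    case 128: toy_kernel<128>(); break;
    default:  std::printf("unsupported head size: %d\n", head_size); break;
  }
}
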
|
||||
|
||||
#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
|
||||
paged_attention_v2_impl_launcher<T, BLOCK_SIZE>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \
|
||||
alibi_slopes);
|
||||
#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
|
||||
paged_attention_v2_impl_launcher<T, BLOCK_SIZE>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, context_lens, block_size, \
|
||||
max_context_len, alibi_slopes);
|
||||
|
||||
#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
|
||||
switch (block_size) { \
|
||||
case 16: \
|
||||
CALL_V2_KERNEL_LAUNCHER(T, 16); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||
break; \
|
||||
#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
|
||||
switch (block_size) { \
|
||||
case 16: \
|
||||
CALL_V2_KERNEL_LAUNCHER(T, 16); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||
break; \
|
||||
}
|
||||
} // namespace
|
||||
} // namespace
|
||||
|
||||
void paged_attention_v2(
|
||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||
const int64_t blocksparse_local_blocks,
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
const int64_t blocksparse_head_sliding_step) {
|
||||
void paged_attention_v2(torch::Tensor &out, torch::Tensor &exp_sums,
|
||||
torch::Tensor &max_logits, torch::Tensor &tmp_out,
|
||||
torch::Tensor &query, torch::Tensor &key_cache,
|
||||
torch::Tensor &value_cache, int num_kv_heads,
|
||||
float scale, torch::Tensor &block_tables,
|
||||
torch::Tensor &context_lens, int block_size,
|
||||
int max_context_len,
|
||||
const c10::optional<torch::Tensor> &alibi_slopes,
|
||||
const std::string &kv_cache_dtype, float kv_scale) {
|
||||
TORCH_CHECK(kv_scale == 1.0f);
|
||||
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
||||
"CPU backend does not support blocksparse attention yet.");
|
||||
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
|
||||
[&] {
|
||||
CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
|
||||
|
||||
@ -5,26 +5,25 @@
|
||||
|
||||
namespace {
|
||||
template <typename scalar_t>
|
||||
void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& mapping_pairs,
|
||||
const int element_num_per_block,
|
||||
const int layer_num) {
|
||||
const size_t pair_num = mapping_pairs.size(0);
|
||||
void copy_blocks_cpu_impl(
|
||||
std::vector<torch::Tensor> &key_caches,
|
||||
std::vector<torch::Tensor> &value_caches,
|
||||
const std::vector<std::pair<int64_t, int64_t>> mapping_pairs,
|
||||
const int element_num_per_block, const int layer_num) {
|
||||
const size_t pair_num = mapping_pairs.size();
|
||||
const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int layer = 0; layer < layer_num; ++layer) {
|
||||
for (size_t pair = 0; pair < pair_num; ++pair) {
|
||||
int64_t source_offset =
|
||||
element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
|
||||
int64_t source_offset = element_num_per_block * mapping_pairs[pair].first;
|
||||
int64_t target_offset =
|
||||
element_num_per_block * mapping_pairs[pair][1].item<int64_t>();
|
||||
scalar_t* key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
|
||||
scalar_t* source_ptr = key_cache_ptr + source_offset;
|
||||
scalar_t* target_ptr = key_cache_ptr + target_offset;
|
||||
element_num_per_block * mapping_pairs[pair].second;
|
||||
scalar_t *key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
|
||||
scalar_t *source_ptr = key_cache_ptr + source_offset;
|
||||
scalar_t *target_ptr = key_cache_ptr + target_offset;
|
||||
std::memcpy(target_ptr, source_ptr, block_bytes);
|
||||
|
||||
scalar_t* value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
|
||||
scalar_t *value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
|
||||
source_ptr = value_cache_ptr + source_offset;
|
||||
target_ptr = value_cache_ptr + target_offset;
|
||||
std::memcpy(target_ptr, source_ptr, block_bytes);
|
||||
@ -34,9 +33,9 @@ void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
|
||||
|
||||
template <typename scalar_t>
|
||||
void reshape_and_cache_cpu_impl(
|
||||
const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
|
||||
scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
|
||||
const int64_t* __restrict__ slot_mapping, const int num_tokens,
|
||||
const scalar_t *__restrict__ key, const scalar_t *__restrict__ value,
|
||||
scalar_t *__restrict__ key_cache, scalar_t *__restrict__ value_cache,
|
||||
const int64_t *__restrict__ slot_mapping, const int num_tokens,
|
||||
const int key_stride, const int value_stride, const int num_heads,
|
||||
const int head_size, const int block_size, const int x) {
|
||||
const int block_elem_num = num_heads * head_size * block_size;
|
||||
@ -49,14 +48,14 @@ void reshape_and_cache_cpu_impl(
|
||||
int src_key_head_idx = token_idx * key_stride + head_idx * head_size;
|
||||
int src_value_head_idx =
|
||||
token_idx * value_stride + head_idx * head_size;
|
||||
const scalar_t* src_key_head_ptr = key + src_key_head_idx;
|
||||
const scalar_t* src_value_head_ptr = value + src_value_head_idx;
|
||||
const scalar_t *src_key_head_ptr = key + src_key_head_idx;
|
||||
const scalar_t *src_value_head_ptr = value + src_value_head_idx;
|
||||
const int64_t block_index = slot_idx / block_size;
|
||||
const int64_t block_offset = slot_idx % block_size;
|
||||
scalar_t* target_key_head_ptr = key_cache +
|
||||
scalar_t *target_key_head_ptr = key_cache +
|
||||
block_elem_num * block_index +
|
||||
head_idx * block_size * head_size;
|
||||
scalar_t* target_value_head_ptr = value_cache +
|
||||
scalar_t *target_value_head_ptr = value_cache +
|
||||
block_elem_num * block_index +
|
||||
head_idx * block_size * head_size;
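
slot_idx / block_size and slot_idx % block_size split a flat slot index into the destination block and the offset within that block. A tiny standalone sketch (struct and function names are illustrative):

#include <cstdint>

// Where a token's KV entry lands in the paged cache.
struct SlotLocation {
  int64_t block_index;
  int64_t block_offset;
};

inline SlotLocation locate_slot(int64_t slot_idx, int block_size) {
  return {slot_idx / block_size, slot_idx % block_size};
}
// Example: slot_idx = 37, block_size = 16 -> block_index 2, block_offset 5.
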
|
||||
|
||||
@ -80,34 +79,39 @@ void reshape_and_cache_cpu_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
}; // namespace
|
||||
}; // namespace
|
||||
|
||||
// Note: the key_caches and value_caches vectors are constant but
|
||||
// not the Tensors they contain. The vectors need to be const refs
|
||||
// in order to satisfy pytorch's C++ operator registration code.
|
||||
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||
std::vector<torch::Tensor> const& value_caches,
|
||||
const torch::Tensor& block_mapping) {
|
||||
unsigned num_layers = key_caches.size();
|
||||
void copy_blocks(std::vector<torch::Tensor> &key_caches,
|
||||
std::vector<torch::Tensor> &value_caches,
|
||||
const std::map<int64_t, std::vector<int64_t>> &block_mapping) {
|
||||
int num_layers = key_caches.size();
|
||||
TORCH_CHECK(num_layers == value_caches.size());
|
||||
if (num_layers == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::pair<int64_t, int64_t>> mapping_pairs;
|
||||
mapping_pairs.reserve(block_mapping.size());
|
||||
for (const auto &pair : block_mapping) {
|
||||
for (const auto &dst : pair.second) {
|
||||
mapping_pairs.emplace_back(pair.first, dst);
|
||||
}
|
||||
}
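
The loop above flattens the {source block -> destination blocks} map into (src, dst) pairs so the collapsed OpenMP copy loop gets a simple linear index space. As a standalone sketch, assuming the same std::map input type shown in the signature:

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// {src: [dst0, dst1, ...]} becomes [(src, dst0), (src, dst1), ...].
std::vector<std::pair<int64_t, int64_t>>
flatten_block_mapping(const std::map<int64_t, std::vector<int64_t>> &block_mapping) {
  std::vector<std::pair<int64_t, int64_t>> mapping_pairs;
  mapping_pairs.reserve(block_mapping.size());
  for (const auto &pair : block_mapping) {
    for (const auto &dst : pair.second) {
      mapping_pairs.emplace_back(pair.first, dst);
    }
  }
  return mapping_pairs;
}
// Example: {{3, {7, 9}}, {5, {2}}} flattens to (3,7), (3,9), (5,2).
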
|
||||
|
||||
const int element_num_per_block = key_caches[0][0].numel();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
|
||||
copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
|
||||
copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, mapping_pairs,
|
||||
element_num_per_block, num_layers);
|
||||
CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
|
||||
});
|
||||
}
|
||||
|
||||
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
const std::string& kv_cache_dtype, double kv_scale) {
|
||||
void reshape_and_cache(torch::Tensor &key, torch::Tensor &value,
|
||||
torch::Tensor &key_cache, torch::Tensor &value_cache,
|
||||
torch::Tensor &slot_mapping,
|
||||
const std::string &kv_cache_dtype, float kv_scale) {
|
||||
TORCH_CHECK(kv_scale == 1.0f);
|
||||
|
||||
int num_tokens = key.size(0);
|
||||
@ -131,7 +135,7 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||
});
|
||||
}
|
||||
|
||||
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||
const torch::Tensor& block_mapping) {
|
||||
void swap_blocks(torch::Tensor &src, torch::Tensor &dst,
|
||||
const std::map<int64_t, int64_t> &block_mapping) {
|
||||
TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
|
||||
}
|
||||
|
||||
@ -2,14 +2,351 @@
#ifndef CPU_TYPES_HPP
#define CPU_TYPES_HPP

#if defined(__x86_64__)
//x86 implementation
#include "cpu_types_x86.hpp"
#elif defined(__POWER9_VECTOR__)
//ppc implementation
#include "cpu_types_vsx.hpp"
#include <immintrin.h>
#include <torch/extension.h>
||||
|
||||
namespace vec_op {
|
||||
|
||||
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||
|
||||
#ifndef CPU_OP_GUARD
|
||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||
#else
|
||||
#warning "unsupported vLLM cpu implementation"
|
||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||
std::cout << #NAME << " invoked." << std::endl;
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
||||
#endif
|
||||
|
||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||
|
||||
namespace {
|
||||
template <typename T, T... indexes, typename F>
|
||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
||||
(f(std::integral_constant<T, indexes>{}), ...);
|
||||
}
|
||||
}; // namespace
|
||||
|
||||
template <typename T, T count, typename F,
|
||||
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
||||
constexpr void unroll_loop(F &&f) {
|
||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||
}
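
A short usage example for the unroll_loop helper defined above, assuming this header is included: the lambda is invoked with compile-time indices (std::integral_constant values convertible to int), so the compiler can emit eight straight-line additions rather than a runtime loop.

inline float sum_of_eight(const float (&vals)[8]) {
  float sum = 0.0f;
  vec_op::unroll_loop<int, 8>([&](int i) { sum += vals[i]; });
  return sum;
}
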
|
||||
|
||||
template <typename T> struct Vec {
|
||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||
};
|
||||
|
||||
struct FP32Vec8;
|
||||
struct FP32Vec16;
|
||||
|
||||
#ifdef __AVX512FP16__
|
||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
__m128h reg;
|
||||
|
||||
explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
|
||||
|
||||
explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
|
||||
|
||||
explicit FP16Vec8(__m128h data) : reg(data) {}
|
||||
|
||||
FP16Vec8 operator*(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_mul_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
FP16Vec8 operator+(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_add_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
FP16Vec8 operator-(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_sub_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
FP16Vec8 operator/(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_div_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
|
||||
};
|
||||
#endif
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
__m128i reg;
|
||||
|
||||
explicit BF16Vec8(const void *ptr)
|
||||
: reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
|
||||
|
||||
explicit BF16Vec8(const FP32Vec8 &);
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
|
||||
__m256i reg;
|
||||
|
||||
explicit BF16Vec16(const void *ptr)
|
||||
: reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16 &);
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
|
||||
__m512i reg;
|
||||
|
||||
explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
|
||||
|
||||
explicit BF16Vec32(__m512i data) : reg(data) {}
|
||||
|
||||
explicit BF16Vec32(BF16Vec8 &vec8_data)
|
||||
: reg((__m512i)_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
|
||||
(__m128i)vec8_data.reg),
|
||||
(__m128i)vec8_data.reg, 1),
|
||||
(__m128i)vec8_data.reg, 2),
|
||||
(__m128i)vec8_data.reg, 3)) {}
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||
constexpr static int VEC_ELEM_NUM = 4;
|
||||
union AliasReg {
|
||||
__m128 reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
__m128 reg;
|
||||
|
||||
explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
|
||||
|
||||
explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
|
||||
|
||||
explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
|
||||
|
||||
explicit FP32Vec4(__m128 data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
|
||||
};
|
||||
|
||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
union AliasReg {
|
||||
__m256 reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
__m256 reg;
|
||||
|
||||
explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
|
||||
|
||||
explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
|
||||
|
||||
explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
|
||||
|
||||
explicit FP32Vec8(__m256 data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
|
||||
|
||||
#ifdef __AVX512FP16__
|
||||
explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
|
||||
#endif
|
||||
|
||||
explicit FP32Vec8(const BF16Vec8 &v)
|
||||
: reg(_mm256_castsi256_ps(
|
||||
_mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
|
||||
|
||||
float reduce_sum() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
float result = 0;
|
||||
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
|
||||
expf(ar.values[5]), expf(ar.values[4]),
|
||||
expf(ar.values[3]), expf(ar.values[2]),
|
||||
expf(ar.values[1]), expf(ar.values[0])));
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
|
||||
tanhf(ar.values[5]), tanhf(ar.values[4]),
|
||||
tanhf(ar.values[3]), tanhf(ar.values[2]),
|
||||
tanhf(ar.values[1]), tanhf(ar.values[0])));
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
|
||||
erf(ar.values[5]), erf(ar.values[4]),
|
||||
erf(ar.values[3]), erf(ar.values[2]),
|
||||
erf(ar.values[1]), erf(ar.values[0])));
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8 &b) const {
|
||||
return FP32Vec8(_mm256_mul_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
FP32Vec8 operator+(const FP32Vec8 &b) const {
|
||||
return FP32Vec8(_mm256_add_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
FP32Vec8 operator-(const FP32Vec8 &b) const {
|
||||
return FP32Vec8(_mm256_sub_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
FP32Vec8 operator/(const FP32Vec8 &b) const {
|
||||
return FP32Vec8(_mm256_div_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
|
||||
};
|
||||
|
||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
union AliasReg {
|
||||
__m512 reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
__m512 reg;
|
||||
|
||||
explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
|
||||
|
||||
explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
|
||||
|
||||
explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
|
||||
|
||||
explicit FP32Vec16(__m512 data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec4 &data)
|
||||
: reg((__m512)_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
|
||||
(__m128i)data.reg, 1),
|
||||
(__m128i)data.reg, 2),
|
||||
(__m128i)data.reg, 3)) {}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec8 &data)
|
||||
: reg((__m512)_mm512_inserti32x8(
|
||||
_mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec16 &v)
|
||||
: reg(_mm512_castsi512_ps(
|
||||
_mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
|
||||
|
||||
FP32Vec16 operator*(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(_mm512_mul_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
FP32Vec16 operator+(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(_mm512_add_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
FP32Vec16 operator-(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(_mm512_sub_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
FP32Vec16 operator/(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(_mm512_div_ps(reg, b.reg));
|
||||
}
|
||||
|
||||
float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
|
||||
|
||||
template <int group_size> float reduce_sub_sum(int idx) {
|
||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
|
||||
__mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
|
||||
return _mm512_mask_reduce_add_ps(mask, reg);
|
||||
}
|
||||
|
||||
void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
|
||||
};
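
reduce_sub_sum above builds a lane mask from group_size low bits shifted to the group's position and hands it to _mm512_mask_reduce_add_ps. A scalar equivalent, handy for checking the mask construction:

#include <cstdint>

// Sum the group_size lanes starting at idx * group_size, selected by the
// same mask construction as FP32Vec16::reduce_sub_sum.
float reduce_sub_sum_scalar(const float (&lanes)[16], int group_size, int idx) {
  const uint32_t base_mask = 0xFFFFu >> (16 - group_size);
  const uint32_t mask = base_mask << (idx * group_size);
  float sum = 0.0f;
  for (int i = 0; i < 16; ++i) {
    if (mask & (1u << i)) sum += lanes[i];  // e.g. group_size=4, idx=2 -> lanes 8..11
  }
  return sum;
}
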
|
||||
|
||||
template <typename T> struct VecType { using vec_type = void; };
|
||||
|
||||
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||
|
||||
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
||||
|
||||
#ifdef __AVX512FP16__
|
||||
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
|
||||
#endif
|
||||
|
||||
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
||||
|
||||
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
|
||||
|
||||
#ifdef __AVX512FP16__
|
||||
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
|
||||
*reinterpret_cast<_Float16 *>(ptr) = v;
|
||||
}
|
||||
#endif
|
||||
|
||||
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
||||
acc = acc + a * b;
|
||||
}
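
A short usage sketch for the fma helper above (assuming this header is included; the function name and pointer arguments are illustrative). This is roughly how the value-reduction loop accumulates probability-scaled value vectors into its fp32 accumulators:

void accumulate_scaled(float *acc_out, const float *prob, const float *v) {
  vec_op::FP32Vec16 acc(acc_out);
  vec_op::FP32Vec16 p(prob);
  vec_op::FP32Vec16 val(v);
  vec_op::fma(acc, p, val);  // acc = acc + p * val
  acc.save(acc_out);
}
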
|
||||
|
||||
#ifdef __AVX512BF16__
|
||||
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
||||
*reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
|
||||
}
|
||||
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
||||
: reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
|
||||
|
||||
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
||||
: reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
|
||||
|
||||
inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
|
||||
acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
|
||||
}
|
||||
#else
|
||||
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
||||
c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
|
||||
reinterpret_cast<c10::BFloat16 *>(&v);
|
||||
*ptr = *(v_ptr + 1);
|
||||
}
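
The fallback above stores a bfloat16 by taking the upper half of the float's bit pattern (the second 16-bit word on little-endian), i.e. truncation rather than rounding, whereas the AVX512BF16 path uses _mm_cvtness_sbh for a rounded conversion. A scalar model:

#include <cstdint>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 float, so truncation is a shift.
uint16_t fp32_to_bf16_truncate(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}
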
|
||||
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
|
||||
: reg(_mm256_cvtepi32_epi16(
|
||||
_mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
|
||||
|
||||
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
|
||||
: reg(_mm512_cvtepi32_epi16(
|
||||
_mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
|
||||
#endif
|
||||
|
||||
inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
|
||||
|
||||
}; // namespace vec_op
|
||||
|
||||
#endif
|
||||
|
||||
@ -1,491 +0,0 @@
|
||||
|
||||
#ifndef CPU_TYPES_VSX_HPP
|
||||
#define CPU_TYPES_VSX_HPP
|
||||
|
||||
#include <altivec.h>
|
||||
#include <cmath>
|
||||
#include <torch/all.h>
|
||||
|
||||
namespace vec_op {
|
||||
|
||||
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||
|
||||
#ifndef CPU_OP_GUARD
|
||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||
#else
|
||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||
std::cout << #NAME << " invoked." << std::endl;
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
||||
#endif
|
||||
|
||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||
|
||||
namespace {
|
||||
template <typename T, T... indexes, typename F>
|
||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
||||
(f(std::integral_constant<T, indexes>{}), ...);
|
||||
}
|
||||
}; // namespace
|
||||
|
||||
template <typename T, T count, typename F,
|
||||
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
||||
constexpr void unroll_loop(F &&f) {
|
||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||
}
|
||||
|
||||
template <typename T> struct Vec {
|
||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||
};
|
||||
|
||||
typedef struct ss16x8x2_t {
|
||||
__vector signed short val[2];
|
||||
} ss16x8x2_t;
|
||||
|
||||
typedef struct ss16x8x4_t {
|
||||
__vector signed short val[4];
|
||||
} ss16x8x4_t;
|
||||
|
||||
typedef struct f32x4x2_t {
|
||||
__vector float val[2];
|
||||
} f32x4x2_t;
|
||||
|
||||
typedef struct f32x4x4_t {
|
||||
__vector float val[4];
|
||||
} f32x4x4_t;
|
||||
|
||||
struct FP32Vec8;
|
||||
struct FP32Vec16;
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
__vector signed short reg;
|
||||
|
||||
explicit BF16Vec8(const void *ptr)
|
||||
: reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {}
|
||||
|
||||
explicit BF16Vec8(const FP32Vec8 &);
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
|
||||
ss16x8x2_t reg;
|
||||
|
||||
explicit BF16Vec16(const void *ptr) {
|
||||
// Load 256 bits in two parts
|
||||
reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr);
|
||||
reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr);
|
||||
}
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16 &);
|
||||
|
||||
void save(void *ptr) const {
|
||||
// Save 256 bits in two parts
|
||||
vec_xst(reg.val[0], 0, (signed short *)ptr);
|
||||
vec_xst(reg.val[1], 16, (signed short *)ptr);
|
||||
}
|
||||
};
|
||||
|
||||
const static __vector signed short zero = vec_splats((signed short)0);
|
||||
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
|
||||
ss16x8x4_t reg;
|
||||
explicit BF16Vec32(const void *ptr)
|
||||
: reg(*reinterpret_cast<const ss16x8x4_t *>(ptr)) {}
|
||||
|
||||
explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}
|
||||
|
||||
explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
|
||||
vec8_data.reg,
|
||||
vec8_data.reg,
|
||||
vec8_data.reg,
|
||||
vec8_data.reg
|
||||
}) {}
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<ss16x8x4_t *>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||
constexpr static int VEC_ELEM_NUM = 4;
|
||||
union AliasReg {
|
||||
__vector float reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
__vector float reg;
|
||||
|
||||
explicit FP32Vec4(float v) : reg(vec_splats(v)) {}
|
||||
|
||||
explicit FP32Vec4() : reg(vec_splats(0.0f)) {}
|
||||
|
||||
explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {}
|
||||
|
||||
explicit FP32Vec4(__vector float data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
|
||||
};
|
||||
|
||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
union AliasReg {
|
||||
f32x4x2_t reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
f32x4x2_t reg;
|
||||
|
||||
explicit FP32Vec8(float v) {
|
||||
reg.val[0] = vec_splats(v);
|
||||
reg.val[1] = vec_splats(v);
|
||||
}
|
||||
|
||||
explicit FP32Vec8() {
|
||||
reg.val[0] = vec_splats(0.0f);
|
||||
reg.val[1] = vec_splats(0.0f);
|
||||
}
|
||||
|
||||
explicit FP32Vec8(const float *ptr) {
|
||||
reg.val[0] = vec_xl(0, ptr);
|
||||
reg.val[1] = vec_xl(16, ptr);
|
||||
}
|
||||
|
||||
explicit FP32Vec8(f32x4x2_t data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec8(const FP32Vec8 &data) {
|
||||
reg.val[0] = data.reg.val[0];
|
||||
reg.val[1] = data.reg.val[1];
|
||||
}
|
||||
|
||||
explicit FP32Vec8(const BF16Vec8 &v) {
|
||||
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
|
||||
reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
float result = 0;
|
||||
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::exp(ar.values[0]);
|
||||
ret.val[0][1] = std::exp(ar.values[1]);
|
||||
ret.val[0][2] = std::exp(ar.values[2]);
|
||||
ret.val[0][3] = std::exp(ar.values[3]);
|
||||
ret.val[1][0] = std::exp(ar.values[4]);
|
||||
ret.val[1][1] = std::exp(ar.values[5]);
|
||||
ret.val[1][2] = std::exp(ar.values[6]);
|
||||
ret.val[1][3] = std::exp(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::tanh(ar.values[0]);
|
||||
ret.val[0][1] = std::tanh(ar.values[1]);
|
||||
ret.val[0][2] = std::tanh(ar.values[2]);
|
||||
ret.val[0][3] = std::tanh(ar.values[3]);
|
||||
ret.val[1][0] = std::tanh(ar.values[4]);
|
||||
ret.val[1][1] = std::tanh(ar.values[5]);
|
||||
ret.val[1][2] = std::tanh(ar.values[6]);
|
||||
ret.val[1][3] = std::tanh(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::erf(ar.values[0]);
|
||||
ret.val[0][1] = std::erf(ar.values[1]);
|
||||
ret.val[0][2] = std::erf(ar.values[2]);
|
||||
ret.val[0][3] = std::erf(ar.values[3]);
|
||||
ret.val[1][0] = std::erf(ar.values[4]);
|
||||
ret.val[1][1] = std::erf(ar.values[5]);
|
||||
ret.val[1][2] = std::erf(ar.values[6]);
|
||||
ret.val[1][3] = std::erf(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8 &b) const {
|
||||
return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
|
||||
}
|
||||
|
||||
FP32Vec8 operator+(const FP32Vec8 &b) const {
|
||||
return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
|
||||
}
|
||||
|
||||
FP32Vec8 operator-(const FP32Vec8 &b) const {
|
||||
return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
|
||||
}
|
||||
|
||||
FP32Vec8 operator/(const FP32Vec8 &b) const {
|
||||
return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
|
||||
}
|
||||
|
||||
void save(float *ptr) const {
|
||||
vec_xst(reg.val[0], 0, ptr);
|
||||
vec_xst(reg.val[1], 16, ptr);
|
||||
}
|
||||
};
|
||||
|
||||
struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
union AliasReg {
|
||||
f32x4x4_t reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
f32x4x4_t reg;
|
||||
|
||||
explicit FP32Vec16(float v) {
|
||||
reg.val[0] = vec_splats(v);
|
||||
reg.val[1] = vec_splats(v);
|
||||
reg.val[2] = vec_splats(v);
|
||||
reg.val[3] = vec_splats(v);
|
||||
}
|
||||
|
||||
explicit FP32Vec16() {
|
||||
reg.val[0] = vec_splats(0.0f);
|
||||
reg.val[1] = vec_splats(0.0f);
|
||||
reg.val[2] = vec_splats(0.0f);
|
||||
reg.val[3] = vec_splats(0.0f);
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const float *ptr) {
|
||||
reg.val[0] = vec_xl(0, ptr);
|
||||
reg.val[1] = vec_xl(16, ptr);
|
||||
reg.val[2] = vec_xl(32, ptr);
|
||||
reg.val[3] = vec_xl(48, ptr);
|
||||
}
|
||||
|
||||
explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec16 &data) {
|
||||
reg.val[0] = data.reg.val[0];
|
||||
reg.val[1] = data.reg.val[1];
|
||||
reg.val[2] = data.reg.val[2];
|
||||
reg.val[3] = data.reg.val[3];
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec4 &data) {
|
||||
reg.val[0] = data.reg;
|
||||
reg.val[1] = data.reg;
|
||||
reg.val[2] = data.reg;
|
||||
reg.val[3] = data.reg;
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec8 &data) {
|
||||
reg.val[0] = data.reg.val[0];
|
||||
reg.val[1] = data.reg.val[1];
|
||||
reg.val[2] = data.reg.val[0];
|
||||
reg.val[3] = data.reg.val[1];
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec16 &v) {
|
||||
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
|
||||
reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
|
||||
reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
|
||||
reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
|
||||
|
||||
FP32Vec16 operator*(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(f32x4x4_t({
|
||||
vec_mul(reg.val[0], b.reg.val[0]),
|
||||
vec_mul(reg.val[1], b.reg.val[1]),
|
||||
vec_mul(reg.val[2], b.reg.val[2]),
|
||||
vec_mul(reg.val[3], b.reg.val[3])}));
|
||||
}
|
||||
|
||||
FP32Vec16 operator+(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(f32x4x4_t({
|
||||
vec_add(reg.val[0], b.reg.val[0]),
|
||||
vec_add(reg.val[1], b.reg.val[1]),
|
||||
vec_add(reg.val[2], b.reg.val[2]),
|
||||
vec_add(reg.val[3], b.reg.val[3])}));
|
||||
}
|
||||
|
||||
FP32Vec16 operator-(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(f32x4x4_t({
|
||||
vec_sub(reg.val[0], b.reg.val[0]),
|
||||
vec_sub(reg.val[1], b.reg.val[1]),
|
||||
vec_sub(reg.val[2], b.reg.val[2]),
|
||||
vec_sub(reg.val[3], b.reg.val[3])}));
|
||||
}
|
||||
|
||||
FP32Vec16 operator/(const FP32Vec16 &b) const {
|
||||
return FP32Vec16(f32x4x4_t({
|
||||
vec_div(reg.val[0], b.reg.val[0]),
|
||||
vec_div(reg.val[1], b.reg.val[1]),
|
||||
vec_div(reg.val[2], b.reg.val[2]),
|
||||
vec_div(reg.val[3], b.reg.val[3])}));
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
float result = 0;
|
||||
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <int group_size> float reduce_sub_sum(int idx) {
|
||||
static_assert(VEC_ELEM_NUM % group_size == 0);
|
||||
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
float result = 0;
|
||||
const int start = idx * group_size;
|
||||
unroll_loop<int, group_size>(
|
||||
[&result, &start, ar](int i) { result += ar.values[start + i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void save(float *ptr) const {
|
||||
vec_xst(reg.val[0], 0, ptr);
|
||||
vec_xst(reg.val[1], 16, ptr);
|
||||
vec_xst(reg.val[2], 32, ptr);
|
||||
vec_xst(reg.val[3], 48, ptr);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct VecType { using vec_type = void; };
|
||||
|
||||
template <typename T> using vec_t = typename VecType<T>::vec_type;
|
||||
|
||||
template <> struct VecType<float> { using vec_type = FP32Vec8; };
|
||||
|
||||
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
|
||||
|
||||
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
|
||||
|
||||
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
|
||||
acc = acc + a * b;
|
||||
}
|
||||
|
||||
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
|
||||
c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
|
||||
reinterpret_cast<c10::BFloat16 *>(&v);
|
||||
*ptr = *(v_ptr + 1);
|
||||
}
|
||||
|
||||
#ifndef __VEC_CLASS_FP_NAN
|
||||
#define __VEC_CLASS_FP_NAN (1 << 6)
|
||||
#endif
|
||||
|
||||
const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
|
||||
#ifndef _ARCH_PWR10
|
||||
const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff };
|
||||
const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
|
||||
const static __vector unsigned int sh16 = { 16, 16, 16, 16 };
|
||||
const static __vector unsigned int one = { 1, 1, 1, 1 };
|
||||
#endif
|
||||
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
|
||||
#ifdef _ARCH_PWR10
|
||||
__vector signed short ret[2];
|
||||
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
|
||||
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
|
||||
reg = vec_perm(ret[0], ret[1], omask);
|
||||
#elif defined(_ARCH_PWR9)
|
||||
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
|
||||
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||
__vector unsigned int lsb0 = vec_sr(inp0, sh16);
|
||||
__vector unsigned int lsb1 = vec_sr(inp1, sh16);
|
||||
lsb0 = vec_and(lsb0, one);
|
||||
lsb1 = vec_and(lsb1, one);
|
||||
__vector unsigned int rnd0 = vec_add(lsb0, bias);
|
||||
__vector unsigned int rnd1 = vec_add(lsb1, bias);
|
||||
inp0 = vec_add(inp0, rnd0);
|
||||
inp1 = vec_add(inp1, rnd1);
|
||||
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
|
||||
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
|
||||
inp0 = vec_sel(inp0, nan, sel0);
|
||||
inp1 = vec_sel(inp1, nan, sel1);
|
||||
inp0 = vec_sr(inp0, sh16);
|
||||
inp1 = vec_sr(inp1, sh16);
|
||||
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||
#endif
|
||||
}
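
The Power9 path above rounds to nearest-even before truncating to bfloat16: it adds 0x7fff plus the least-significant bit of the kept half, forces NaNs to a canonical quiet NaN, then shifts right by 16. A scalar model of the same steps (not the kernel's code, just the arithmetic written out):

#include <cstdint>
#include <cstring>

uint16_t fp32_to_bf16_rne(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  if ((bits & 0x7f800000u) == 0x7f800000u && (bits & 0x007fffffu) != 0) {
    return 0x7fc0;  // canonical quiet NaN, mirroring the vec_sel(nan, ...) step
  }
  const uint32_t lsb = (bits >> 16) & 1u;
  bits += 0x7fffu + lsb;  // round to nearest, ties to even
  return static_cast<uint16_t>(bits >> 16);
}
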
|
||||
|
||||
inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
|
||||
#ifdef _ARCH_PWR10
|
||||
__vector signed short ret[4];
|
||||
ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
|
||||
ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
|
||||
ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]);
|
||||
ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]);
|
||||
reg.val[0] = vec_perm(ret[0], ret[1], omask);
|
||||
reg.val[1] = vec_perm(ret[2], ret[3], omask);
|
||||
#elif defined(_ARCH_PWR9)
|
||||
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
|
||||
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
|
||||
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
|
||||
__vector unsigned int lsb0 = vec_sr(inp0, sh16);
|
||||
__vector unsigned int lsb1 = vec_sr(inp1, sh16);
|
||||
__vector unsigned int lsb2 = vec_sr(inp2, sh16);
|
||||
__vector unsigned int lsb3 = vec_sr(inp3, sh16);
|
||||
lsb0 = vec_and(lsb0, one);
|
||||
lsb1 = vec_and(lsb1, one);
|
||||
lsb2 = vec_and(lsb2, one);
|
||||
lsb3 = vec_and(lsb3, one);
|
||||
__vector unsigned int rnd0 = vec_add(lsb0, bias);
|
||||
__vector unsigned int rnd1 = vec_add(lsb1, bias);
|
||||
__vector unsigned int rnd2 = vec_add(lsb2, bias);
|
||||
__vector unsigned int rnd3 = vec_add(lsb3, bias);
|
||||
inp0 = vec_add(inp0, rnd0);
|
||||
inp1 = vec_add(inp1, rnd1);
|
||||
inp2 = vec_add(inp2, rnd2);
|
||||
inp3 = vec_add(inp3, rnd3);
|
||||
__vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
|
||||
__vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
|
||||
__vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
|
||||
__vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
|
||||
inp0 = vec_sel(inp0, nan, sel0);
|
||||
inp1 = vec_sel(inp1, nan, sel1);
|
||||
inp2 = vec_sel(inp2, nan, sel2);
|
||||
inp3 = vec_sel(inp3, nan, sel3);
|
||||
inp0 = vec_sr(inp0, sh16);
|
||||
inp1 = vec_sr(inp1, sh16);
|
||||
inp2 = vec_sr(inp2, sh16);
|
||||
inp3 = vec_sr(inp3, sh16);
|
||||
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void prefetch(const void *addr) {
|
||||
__asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
|
||||
}
|
||||
|
||||
}; // namespace vec_op
|
||||
|
||||
#endif
|
||||
@ -1,515 +0,0 @@
|
||||
|
||||
#ifndef CPU_TYPES_X86_HPP
|
||||
#define CPU_TYPES_X86_HPP
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
#ifndef __AVX2__
|
||||
static_assert(false, "AVX2 must be supported for the current implementation.");
|
||||
#endif
|
||||
|
||||
namespace vec_op {
|
||||
|
||||
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||
|
||||
#ifndef CPU_OP_GUARD
|
||||
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||
#else
|
||||
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||
std::cout << #NAME << " invoked." << std::endl;
|
||||
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
||||
#endif
|
||||
|
||||
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||
|
||||
namespace {
|
||||
template <typename T, T... indexes, typename F>
|
||||
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
||||
(f(std::integral_constant<T, indexes>{}), ...);
|
||||
}
|
||||
}; // namespace
|
||||
|
||||
template <typename T, T count, typename F,
|
||||
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
||||
constexpr void unroll_loop(F &&f) {
|
||||
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||
}
|
||||
|
||||
template <typename T> struct Vec {
|
||||
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||
};
|
||||
|
||||
struct FP32Vec8;
|
||||
struct FP32Vec16;
|
||||
|
||||
#ifdef __AVX512FP16__
|
||||
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
__m128h reg;
|
||||
|
||||
explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
|
||||
|
||||
explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
|
||||
|
||||
explicit FP16Vec8(__m128h data) : reg(data) {}
|
||||
|
||||
FP16Vec8 operator*(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_mul_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
FP16Vec8 operator+(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_add_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
FP16Vec8 operator-(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_sub_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
FP16Vec8 operator/(const FP16Vec8 &b) const {
|
||||
return FP16Vec8(_mm_div_ph(reg, b.reg));
|
||||
}
|
||||
|
||||
void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
|
||||
};
|
||||
#endif
|
||||
|
||||
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
|
||||
__m128i reg;
|
||||
|
||||
explicit BF16Vec8(const void *ptr)
|
||||
: reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
|
||||
|
||||
explicit BF16Vec8(const FP32Vec8 &);
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
|
||||
};
|
||||
|
||||
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
constexpr static int VEC_ELEM_NUM = 16;
|
||||
|
||||
__m256i reg;
|
||||
|
||||
explicit BF16Vec16(const void *ptr)
|
||||
: reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16 &);
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
||||
};
|
||||
|
||||
#ifdef __AVX512F__
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
|
||||
__m512i reg;
|
||||
|
||||
explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
|
||||
|
||||
explicit BF16Vec32(__m512i data) : reg(data) {}
|
||||
|
||||
explicit BF16Vec32(BF16Vec8 &vec8_data)
|
||||
: reg((__m512i)_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
|
||||
(__m128i)vec8_data.reg),
|
||||
(__m128i)vec8_data.reg, 1),
|
||||
(__m128i)vec8_data.reg, 2),
|
||||
(__m128i)vec8_data.reg, 3)) {}
|
||||
|
||||
void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
|
||||
};
|
||||
#else
|
||||
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||
constexpr static int VEC_ELEM_NUM = 32;
|
||||
|
||||
__m256i reg_low;
|
||||
__m256i reg_high;
|
||||
|
||||
explicit BF16Vec32(const void *ptr)
|
||||
: reg_low(_mm256_loadu_si256((__m256i const *)ptr)),
|
||||
reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {}
|
||||
|
||||
explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low),
|
||||
reg_high(high) {}
|
||||
|
||||
explicit BF16Vec32(BF16Vec8 &vec8_data)
|
||||
: reg_low((__m256i)_mm256_inserti32x4(
|
||||
_mm256_castsi128_si256((__m128i)vec8_data.reg),
|
||||
(__m128i)vec8_data.reg, 1)),
|
||||
reg_high((__m256i)_mm256_inserti32x4(
|
||||
_mm256_castsi128_si256((__m128i)vec8_data.reg),
|
||||
(__m128i)vec8_data.reg, 1)) {}
|
||||
|
||||
void save(void *ptr) const {
|
||||
*reinterpret_cast<__m256i *>(ptr) = reg_low;
|
||||
*reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||
constexpr static int VEC_ELEM_NUM = 4;
|
||||
union AliasReg {
|
||||
__m128 reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
__m128 reg;
|
||||
|
||||
explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
|
||||
|
||||
explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
|
||||
|
||||
explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
|
||||
|
||||
explicit FP32Vec4(__m128 data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
|
||||
};
|
||||
|
||||
struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
constexpr static int VEC_ELEM_NUM = 8;
|
||||
union AliasReg {
|
||||
__m256 reg;
|
||||
float values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
__m256 reg;
|
||||
|
||||
explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
|
||||
|
||||
explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
|
||||
|
||||
explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
|
||||
|
||||
explicit FP32Vec8(__m256 data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
|
||||
|
||||
#ifdef __AVX512FP16__
|
||||
explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
|
||||
#endif
|
||||
|
||||
explicit FP32Vec8(const BF16Vec8 &v)
|
||||
: reg(_mm256_castsi256_ps(
|
||||
_mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
|
||||
|
||||
float reduce_sum() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
float result = 0;
|
||||
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
|
||||
expf(ar.values[5]), expf(ar.values[4]),
|
||||
expf(ar.values[3]), expf(ar.values[2]),
|
||||
expf(ar.values[1]), expf(ar.values[0])));
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
|
||||
tanhf(ar.values[5]), tanhf(ar.values[4]),
|
||||
tanhf(ar.values[3]), tanhf(ar.values[2]),
|
||||
tanhf(ar.values[1]), tanhf(ar.values[0])));
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
|
||||
erf(ar.values[5]), erf(ar.values[4]),
|
||||
erf(ar.values[3]), erf(ar.values[2]),
|
||||
erf(ar.values[1]), erf(ar.values[0])));
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8 &b) const {
|
||||
    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
  }

  FP32Vec8 operator+(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_add_ps(reg, b.reg));
  }

  FP32Vec8 operator-(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
  }

  FP32Vec8 operator/(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_div_ps(reg, b.reg));
  }

  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};

#ifdef __AVX512F__
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  union AliasReg {
    __m512 reg;
    float values[VEC_ELEM_NUM];
  };

  __m512 reg;

  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}

  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}

  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

  explicit FP32Vec16(const FP32Vec4 &data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
                                   (__m128i)data.reg, 1),
                (__m128i)data.reg, 2),
            (__m128i)data.reg, 3)) {}

  explicit FP32Vec16(const FP32Vec8 &data)
      : reg((__m512)_mm512_inserti32x8(
            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}

  explicit FP32Vec16(const BF16Vec16 &v)
      : reg(_mm512_castsi512_ps(
            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}

  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_add_ps(reg, b.reg));
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }

  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }

  template <int group_size> float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }

  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
};
#else
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  union AliasReg {
    __m256 reg;
    float values[8];
  };

  __m256 reg_low;
  __m256 reg_high;

  explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)),
                                reg_high(_mm256_set1_ps(v)) {}

  explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)),
                         reg_high(_mm256_set1_ps(0.0)) {}

  explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)),
                                         reg_high(_mm256_loadu_ps(ptr + 8)) {}

  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}

  explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low),
                                              reg_high(data.reg_high) {}

  explicit FP32Vec16(const FP32Vec4 &data)
      : reg_low((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg),
            (__m128i)data.reg, 1)),
        reg_high((__m256)_mm256_inserti128_si256(
            _mm256_castsi128_si256((__m128i)data.reg),
            (__m128i)data.reg, 1)) {}

  explicit FP32Vec16(const FP32Vec8 &data)
      : reg_low(data.reg), reg_high(data.reg) {}

  explicit FP32Vec16(const BF16Vec16 &v) {
    __m128i low = _mm256_extractf128_si256(v.reg, 0);
    __m128i high = _mm256_extractf128_si256(v.reg, 1);

    __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low);
    __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high);

    __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2);
    __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2);

    reg_low = _mm256_castsi256_ps(v_low_shifted);
    reg_high = _mm256_castsi256_ps(v_high_shifted);
  }

  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
                     _mm256_mul_ps(reg_high, b.reg_high));
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
                     _mm256_add_ps(reg_high, b.reg_high));
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
                     _mm256_sub_ps(reg_high, b.reg_high));
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
                     _mm256_div_ps(reg_high, b.reg_high));
  }

  float reduce_sum() const {
    FP32Vec8 low = FP32Vec8(reg_low);
    FP32Vec8 high = FP32Vec8(reg_high);
    return low.reduce_sum() + high.reduce_sum();
  }

  template <int group_size> float reduce_sub_sum(int idx) {
    float sum = 0.0;
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    uint32_t mask = base_mask << (idx * group_size);

    AliasReg ar;

    auto func = [&sum, &mask, &ar](int i) {
      int flag = mask & 0x1;
      mask = mask >> 1;
      if (flag != 0) sum += ar.values[i];
    };

    ar.reg = reg_low;
    unroll_loop<int, 8>(func);

    ar.reg = reg_high;
    unroll_loop<int, 8>(func);

    return sum;
  }

  void save(float *ptr) const {
    _mm256_storeu_ps(ptr, reg_low);
    _mm256_storeu_ps(ptr + 8, reg_high);
  }
};
#endif

template <typename T> struct VecType { using vec_type = void; };

template <typename T> using vec_t = typename VecType<T>::vec_type;

template <> struct VecType<float> { using vec_type = FP32Vec8; };

#ifdef __AVX512FP16__
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
#endif

template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };

template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }

#ifdef __AVX512FP16__
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  *reinterpret_cast<_Float16 *>(ptr) = v;
}
#endif

inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
  acc = acc + a * b;
}

#ifdef __AVX512BF16__
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
}

inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}

inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}

inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
}
#else
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = *(v_ptr + 1);
}

#ifdef __AVX512F__
inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg(_mm256_cvtepi32_epi16(
          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}

inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg(_mm512_cvtepi32_epi16(
          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
#else
namespace {
__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
  __m256i ai = _mm256_castps_si256(a);
  ai = _mm256_srli_epi32(ai, 16);
  ai = _mm256_packus_epi32(ai, ai);
  ai = _mm256_permute4x64_epi64(ai, 0b00111001);
  return _mm256_extracti128_si256(ai, 0);
}
}

inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}

inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
  BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low));
  BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high));
  reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1);
}
#endif // __AVX512F__
#endif // __AVX512BF16__

inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }

}; // namespace vec_op

#endif
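This header is the entire surface the CPU kernels build on: vec_t<scalar_t> picks a register wrapper per dtype, the wrappers expose load/save plus arithmetic, and fma accumulates products. A minimal sketch of that usage pattern, written only against members visible above; dot_fp32 is a hypothetical helper for illustration, not part of the diff, and it assumes AVX-512 and a length that is a multiple of 16:

// Sum x[i] * y[i] over a buffer whose length is a multiple of 16 floats,
// using the FP32Vec16 wrapper and the fma(acc, a, b) helper from vec_op.
inline float dot_fp32(const float *x, const float *y, int n) {
  vec_op::FP32Vec16 acc(0.0f);
  for (int i = 0; i < n; i += vec_op::FP32Vec16::VEC_ELEM_NUM) {
    vec_op::FP32Vec16 a(x + i);  // unaligned 16-float load
    vec_op::FP32Vec16 b(y + i);
    vec_op::fma(acc, a, b);      // acc = acc + a * b
  }
  return acc.reduce_sum();       // horizontal add of all 16 lanes
}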
@ -2,10 +2,10 @@

namespace {
template <typename scalar_t>
void rms_norm_impl(scalar_t* __restrict__ out,
                   const scalar_t* __restrict__ input,
                   const scalar_t* __restrict__ weight, const float epsilon,
                   const int num_tokens, const int hidden_size) {
void rms_norm_impl(scalar_t *__restrict__ out,
                   const scalar_t *__restrict__ input,
                   const scalar_t *__restrict__ weight, const float epsilon,
                   const int num_tokens, const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
@ -41,11 +41,11 @@ void rms_norm_impl(scalar_t* __restrict__ out,
}

template <typename scalar_t>
void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
                             scalar_t* __restrict__ residual,
                             const scalar_t* __restrict__ weight,
                             const float epsilon, const int num_tokens,
                             const int hidden_size) {
void fused_add_rms_norm_impl(scalar_t *__restrict__ input,
                             scalar_t *__restrict__ residual,
                             const scalar_t *__restrict__ weight,
                             const float epsilon, const int num_tokens,
                             const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);
@ -85,24 +85,24 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
  }
}
}
} // namespace
} // namespace

void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
              double epsilon) {
void rms_norm(torch::Tensor &out, torch::Tensor &input,
              torch::Tensor &weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] {
    CPU_KERNEL_GUARD_IN(rms_norm_impl)
    rms_norm_impl(out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
                  weight.data_ptr<scalar_t>(), epsilon, num_tokens,
                  hidden_size);
                  weight.data_ptr<scalar_t>(), epsilon, num_tokens,
                  hidden_size);
    CPU_KERNEL_GUARD_OUT(rms_norm_impl)
  });
}

void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon) {
void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual,
                        torch::Tensor &weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

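The bodies of rms_norm_impl and fused_add_rms_norm_impl are elided by the hunks above. For orientation, a scalar sketch of the semantics the vectorized kernel is expected to match, assuming the conventional RMSNorm definition; this reference code is an assumption for illustration, not taken from the diff:

#include <cmath>

// Scalar reference: out[t, i] = input[t, i] / rms(input[t, :]) * weight[i],
// where rms(x) = sqrt(mean(x^2) + epsilon), computed row by row.
void rms_norm_reference(float *out, const float *input, const float *weight,
                        float epsilon, int num_tokens, int hidden_size) {
  for (int t = 0; t < num_tokens; ++t) {
    const float *row = input + static_cast<long>(t) * hidden_size;
    float sq_sum = 0.0f;
    for (int i = 0; i < hidden_size; ++i) sq_sum += row[i] * row[i];
    const float inv_rms = 1.0f / std::sqrt(sq_sum / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i)
      out[static_cast<long>(t) * hidden_size + i] = row[i] * inv_rms * weight[i];
  }
}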
@ -4,107 +4,107 @@
namespace {
template <typename scalar_t>
void rotary_embedding_impl(
    const int64_t* __restrict__ positions, // [batch_size, seq_len] or
                                           // [num_tokens]
    scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads,
                                  /// head_size] or [num_tokens, num_heads,
                                  /// head_size]
    scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads,
                                // head_size] or [num_tokens, num_kv_heads,
                                // head_size]
    const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim //
                                                // 2]
    const int64_t
        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
    scalar_t
        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
                             /// [num_tokens, num_heads, head_size]
    scalar_t
        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
                           // [num_tokens, num_kv_heads, head_size]
    const scalar_t
        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  constexpr int ELEM_SIZE = sizeof(scalar_t);

  const int embed_dim = rot_dim / 2;
  bool flag = (embed_dim % VEC_ELEM_NUM == 0);
  const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;

  auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
                          scalar_t* qk) {
    int j = 0;
    for (; j < loop_upper; j += VEC_ELEM_NUM) {
      const int rot_offset = j;
      const int x_index = rot_offset;
      const int y_index = embed_dim + rot_offset;

      const int64_t out_x = token_head + x_index;
      const int64_t out_y = token_head + y_index;

      const scalar_vec_t cos(cache_ptr + x_index);
      const scalar_vec_t sin(cache_ptr + y_index);

      const scalar_vec_t q_x(qk + out_x);
      const scalar_vec_t q_y(qk + out_y);

      vec_op::FP32Vec8 fp32_cos(cos);
      vec_op::FP32Vec8 fp32_sin(sin);

      vec_op::FP32Vec8 fp32_q_x(q_x);
      vec_op::FP32Vec8 fp32_q_y(q_y);

      auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
      scalar_vec_t(out1).save(qk + out_x);

      auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
      scalar_vec_t(out2).save(qk + out_y);
    }
    if (!flag) {
      for (; j < embed_dim; ++j) {
        const int x_index = j;
        const int y_index = embed_dim + j;

        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;

        const float fp32_cos = cache_ptr[x_index];
        const float fp32_sin = cache_ptr[y_index];

        const float fp32_q_x = qk[out_x];
        const float fp32_q_y = qk[out_y];

        qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
        qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
      }
    }
  };
  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    int64_t pos = positions[token_idx];
    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
    const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;

    for (int i = 0; i < num_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
      compute_loop(token_head, cache_ptr, query);
      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
        const int rot_offset = j;
        const int x_index = rot_offset;
        const int y_index = embed_dim + rot_offset;

        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;

        const scalar_vec_t cos(cache_ptr + x_index);
        const scalar_vec_t sin(cache_ptr + y_index);

        const scalar_vec_t q_x(query + out_x);
        const scalar_vec_t q_y(query + out_y);

        vec_op::FP32Vec8 fp32_cos(cos);
        vec_op::FP32Vec8 fp32_sin(sin);

        vec_op::FP32Vec8 fp32_q_x(q_x);
        vec_op::FP32Vec8 fp32_q_y(q_y);

        auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
        scalar_vec_t(out1).save(query + out_x);

        auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
        scalar_vec_t(out2).save(query + out_y);
      }
    }

    for (int i = 0; i < num_kv_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
      compute_loop(token_head, cache_ptr, key);
      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
        const int rot_offset = j;
        const int x_index = rot_offset;
        const int y_index = embed_dim + rot_offset;

        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;

        const scalar_vec_t cos(cache_ptr + x_index);
        const scalar_vec_t sin(cache_ptr + y_index);

        const scalar_vec_t k_x(key + out_x);
        const scalar_vec_t k_y(key + out_y);

        vec_op::FP32Vec8 fp32_cos(cos);
        vec_op::FP32Vec8 fp32_sin(sin);

        vec_op::FP32Vec8 fp32_k_x(k_x);
        vec_op::FP32Vec8 fp32_k_y(k_y);

        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
        scalar_vec_t(out1).save(key + out_x);
        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
        scalar_vec_t(out2).save(key + out_y);
      }
    }
  }
}

template <typename scalar_t>
void rotary_embedding_gptj_impl(
    const int64_t* __restrict__ positions, // [batch_size, seq_len] or
                                           // [num_tokens]
    scalar_t* __restrict__ query, /// [batch_size, seq_len, num_heads,
                                  /// head_size] or [num_tokens, num_heads,
                                  /// head_size]
    scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads,
                                // head_size] or [num_tokens, num_kv_heads,
                                // head_size]
    const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim //
                                                // 2]
    const int64_t
        *__restrict__ positions, // [batch_size, seq_len] or [num_tokens]
    scalar_t
        *__restrict__ query, /// [batch_size, seq_len, num_heads, head_size] or
                             /// [num_tokens, num_heads, head_size]
    scalar_t
        *__restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or
                           // [num_tokens, num_kv_heads, head_size]
    const scalar_t
        *__restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
@ -114,13 +114,13 @@ void rotary_embedding_gptj_impl(
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_heads; ++i) {
      int64_t pos = positions[token_idx];
      const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t* cos_cache_ptr = cache_ptr;
      const scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
      const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t *cos_cache_ptr = cache_ptr;
      const scalar_t *sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
      scalar_t* head_query = token_head + query;
      scalar_t *head_query = token_head + query;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
@ -142,12 +142,12 @@ void rotary_embedding_gptj_impl(
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_kv_heads; ++i) {
      int64_t pos = positions[token_idx];
      const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t* cos_cache_ptr = cache_ptr;
      const scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
      const scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t *cos_cache_ptr = cache_ptr;
      const scalar_t *sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
      scalar_t* head_key = key + token_head;
      scalar_t *head_key = key + token_head;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
@ -165,11 +165,11 @@ void rotary_embedding_gptj_impl(
    }
  }
}
}; // namespace
}; // namespace

void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
                      torch::Tensor &key, int head_size,
                      torch::Tensor &cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;

csrc/cpu/pybind.cpp (new file, 73 lines added)
@ -0,0 +1,73 @@
#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // vLLM custom ops
  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");

  // Attention ops
  ops.def(
    "paged_attention_v1",
    &paged_attention_v1,
    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  ops.def(
    "paged_attention_v2",
    &paged_attention_v2,
    "PagedAttention V2.");

  // Activation ops
  ops.def(
    "silu_and_mul",
    &silu_and_mul,
    "Activation function used in SwiGLU.");
  ops.def(
    "gelu_and_mul",
    &gelu_and_mul,
    "Activation function used in GeGLU with `none` approximation.");
  ops.def(
    "gelu_tanh_and_mul",
    &gelu_tanh_and_mul,
    "Activation function used in GeGLU with `tanh` approximation.");
  ops.def(
    "gelu_new",
    &gelu_new,
    "GELU implementation used in GPT-2.");
  ops.def(
    "gelu_fast",
    &gelu_fast,
    "Approximate GELU implementation.");

  // Layernorm
  ops.def(
    "rms_norm",
    &rms_norm,
    "Apply Root Mean Square (RMS) Normalization to the input tensor.");

  ops.def(
    "fused_add_rms_norm",
    &fused_add_rms_norm,
    "In-place fused Add and RMS Normalization");

  // Rotary embedding
  ops.def(
    "rotary_embedding",
    &rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");

  // Cache ops
  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
  cache_ops.def(
    "swap_blocks",
    &swap_blocks,
    "Swap in (out) the cache blocks from src to dst");
  cache_ops.def(
    "copy_blocks",
    &copy_blocks,
    "Copy the cache blocks from src to dst");
  cache_ops.def(
    "reshape_and_cache",
    &reshape_and_cache,
    "Reshape the key and value tensors and cache them");
}
@ -1,110 +0,0 @@
#include "cache.h"
#include "ops.h"
#include "registration.h"

#include <torch/library.h>

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

  // Attention ops
  // Compute the attention between an input query and the cached keys/values
  // using PagedAttention.
  ops.def(
      "paged_attention_v1("
      " Tensor! out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
      " str kv_cache_dtype, float kv_scale, int tp_rank,"
      " int blocksparse_local_blocks,"
      " int blocksparse_vert_stride, int blocksparse_block_size,"
      " int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);

  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
      " Tensor! out, Tensor exp_sums, Tensor max_logits,"
      " Tensor tmp_out, Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads, float scale,"
      " Tensor block_tables, Tensor seq_lens, int block_size,"
      " int max_seq_len, Tensor? alibi_slopes,"
      " str kv_cache_dtype, float kv_scale, int tp_rank,"
      " int blocksparse_local_blocks,"
      " int blocksparse_vert_stride, int blocksparse_block_size,"
      " int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);

  // Activation ops

  // Activation function used in SwiGLU.
  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul);

  // Activation function used in GeGLU with `none` approximation.
  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul);

  // Activation function used in GeGLU with `tanh` approximation.
  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul);

  // GELU implementation used in GPT-2.
  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_new", torch::kCPU, &gelu_new);

  // Approximate GELU implementation.
  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_fast", torch::kCPU, &gelu_fast);

  // Quick GELU implementation.
  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_quick", torch::kCPU, &gelu_quick);

  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
      "()");
  ops.impl("rms_norm", torch::kCPU, &rms_norm);

  // In-place fused Add and RMS Normalization.
  ops.def(
      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm);

  // Rotary embedding
  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
  ops.def(
      "rotary_embedding(Tensor positions, Tensor! query,"
      " Tensor! key, int head_size,"
      " Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
}

TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Cache ops
  // Swap in (out) the cache blocks from src to dst.
  cache_ops.def(
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);

  // Copy the cache blocks from src to dst.
  cache_ops.def(
      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
      "block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);

  // Reshape the key and value tensors and cache them.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"
      " Tensor! key_cache, Tensor! value_cache,"
      " Tensor slot_mapping,"
      " str kv_cache_dtype,"
      " float kv_scale) -> ()");
  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
@ -1,7 +1,7 @@
#pragma once

#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#endif

#ifndef USE_ROCM
@ -17,14 +17,9 @@
#endif

#ifndef USE_ROCM
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
  __shfl_xor_sync(uint32_t(-1), var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
  __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask)
#else
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
  __shfl_xor(var, lane_mask, width)
#endif

#ifndef USE_ROCM
@ -33,13 +28,6 @@
#define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
#endif

#ifndef USE_ROCM
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
  __shfl_down_sync(uint32_t(-1), var, lane_delta)
#else
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
#endif

#ifndef USE_ROCM
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
  cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
@ -47,3 +35,4 @@
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
  hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif

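These VLLM_SHFL_* macros exist so warp-level reductions can be written once and compile on both CUDA and ROCm. A minimal sketch of the typical call site, assuming a 32-lane warp; warpSum is a hypothetical helper for illustration, not part of this diff:

// Butterfly reduction across a warp: after the loop every lane holds the
// sum of all lanes' original values. VLLM_SHFL_XOR_SYNC expands to
// __shfl_xor_sync on CUDA and __shfl_xor on ROCm.
template <typename T>
__device__ __forceinline__ T warpSum(T val) {
  for (int mask = 16; mask > 0; mask >>= 1) {
    val += VLLM_SHFL_XOR_SYNC(val, mask);
  }
  return val;
}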
@ -1,5 +1,10 @@
#pragma once

int64_t get_device_attribute(int64_t attribute, int64_t device_id);
#include <torch/extension.h>

int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
int get_device_attribute(
    int attribute,
    int device_id);

int get_max_shared_memory_per_block_device_attribute(
    int device_id);
Some files were not shown because too many files have changed in this diff.