Compare commits

..

50 Commits

Author SHA1 Message Date
220d694080 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 01:00:20 +00:00
70e06dd574 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-24 00:46:46 +00:00
7954461d4c updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 23:03:42 +00:00
a10da86677 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:56:53 +00:00
284d5df45b added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:50:20 +00:00
d5b0db449e added __init__.py
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:44:36 +00:00
66349c33a1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 22:36:57 +00:00
28d0396ff1 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:54:04 +00:00
2f29ae383a added files
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:45:01 +00:00
cf64b0e6a7 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 21:44:14 +00:00
f51f182d64 pre-commit
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-23 20:18:50 +00:00
79e465f557 fix pre-commit
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-23 09:38:55 -04:00
2ba687d39f updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:52:06 -04:00
5d57896e2c updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:53 -04:00
f6f008ca1d cleanup
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:51:21 -04:00
24cbbe4778 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:50:48 -04:00
2fec6e0b5c working?
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:45:00 -04:00
47a3f26b2a updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 18:36:52 -04:00
144162fc8c Merge branch 'main' into rob-fixes
2025-03-22 17:12:53 -04:00
522279ebb9 Stash
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:12:21 -04:00
85687b43e7 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 17:00:46 -04:00
120bbdfd82 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 15:58:51 -04:00
2ceb7bc534 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:25:05 -04:00
9f7fb5ec84 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:22:00 -04:00
a8a621e419 updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
2025-03-22 13:11:50 -04:00
b89d89f456 fix rebase
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:32:21 +08:00
8355358fb3 add unlimited HWM
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
c0b1443345 fix mypy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
d35dace985 refactor zmq msg to object
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
912031ceb5 refactor disagg
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:20:12 +08:00
4f13e89143 fix SIM105
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
b9a7dbe769 remove default socket address value
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
0cb2e05256 change log level and fix some comments
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
d6945ecdf0 change disagg_prefill example to use zmq
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
298298f97d remove invalid zmq benchmark code
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
6c8fae82dd run format
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:19 +08:00
16ed827378 add benchmark shell
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:18:08 +08:00
8fa9df7987 run format.sh
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
27c1afe88b fix ThreadProxy
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
ee6607332e create proxy sockets in the proxy function for thread safety
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
7fbf70db57 1. replace tcp:// with ipc://
2. fix json response
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
2c31e4c3ea Run yapf and ruff
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:57 +08:00
187f112ccd 1. fix mypy issue
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
897db7b93d Replace zmq.asyncio.Context().term() with zmq.asyncio.Context().destroy(linger=0) for immediate termination
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
b7ffb43792 update disagg_connect test_request.py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
6e1fba8a73 1. connect_parser: make --prefill-addr and --decode-addr required
2. To more accurately reflect its purpose, rename connect.py to disagg_connector.py.

Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
bfde1688e7 add /v1/completions stream support
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:17:44 +08:00
905424ed65 add identity url headers
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
5d20f389d6 add vllm connect cmd
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
2a0cb78016 add test py
Signed-off-by: clark <panf2333@gmail.com>
2025-03-21 08:15:42 +08:00
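
Several commit messages above name specific techniques; the sketches below are editorial illustrations of those techniques, not code taken from this PR.

"add unlimited HWM" (8355358fb3): ZeroMQ treats a high-water mark of 0 as "no limit", so queued messages are buffered in memory instead of being dropped or blocking the sender at a fixed threshold. A minimal pyzmq sketch, with a placeholder socket type and endpoint:

```python
import zmq

ctx = zmq.Context()
sock = ctx.socket(zmq.PUSH)  # illustrative socket type

# A HWM of 0 means "unlimited" in ZeroMQ: messages queue in memory
# rather than blocking or being dropped at a fixed threshold.
sock.setsockopt(zmq.SNDHWM, 0)
sock.setsockopt(zmq.RCVHWM, 0)

sock.connect("ipc:///tmp/disagg_example")  # placeholder endpoint
```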
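
"refactor zmq msg to object" (d35dace985): one common way to send a structured message over ZeroMQ is to serialize a small Python object into a single frame; the dataclass and its fields below are hypothetical, chosen only to show the pattern.

```python
import pickle
from dataclasses import dataclass

import zmq

@dataclass
class ProxyMessage:
    # Hypothetical fields; the real message schema is defined in the PR.
    request_id: str
    payload: dict

def send_message(sock: zmq.Socket, msg: ProxyMessage) -> None:
    # One serialized object per frame instead of hand-packed byte parts.
    sock.send(pickle.dumps(msg))

def recv_message(sock: zmq.Socket) -> ProxyMessage:
    return pickle.loads(sock.recv())
```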
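
"fix SIM105" (4f13e89143): SIM105 is the Ruff/flake8-simplify rule that flags a try/except/pass block which can be replaced with contextlib.suppress. The generic before/after pattern (the file path is just an example):

```python
import contextlib
import os

# Before (flagged by SIM105):
try:
    os.remove("/tmp/example.sock")
except FileNotFoundError:
    pass

# After (what the rule suggests):
with contextlib.suppress(FileNotFoundError):
    os.remove("/tmp/example.sock")
```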
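
"create proxy sockets in the proxy function for thread safety" (ee6607332e): ZeroMQ contexts can be shared across threads but sockets cannot, so sockets should be created inside the function the proxy thread runs. A sketch of that pattern with placeholder addresses and socket types:

```python
import threading

import zmq

ctx = zmq.Context()  # safe to share across threads; individual sockets are not

def run_proxy(frontend_addr: str, backend_addr: str) -> None:
    # Sockets are created here, inside the thread that uses them.
    frontend = ctx.socket(zmq.ROUTER)
    backend = ctx.socket(zmq.DEALER)
    frontend.bind(frontend_addr)
    backend.bind(backend_addr)
    try:
        zmq.proxy(frontend, backend)  # blocks, shuttling messages both ways
    finally:
        frontend.close(0)
        backend.close(0)

threading.Thread(
    target=run_proxy,
    args=("ipc:///tmp/frontend_example", "ipc:///tmp/backend_example"),
    daemon=True,
).start()
```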
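
"Replace zmq.asyncio.Context().term() with zmq.asyncio.Context().destroy(linger=0) for immediate termination" (897db7b93d): term() waits for every socket to be closed and for pending messages according to each socket's LINGER setting, which can hang shutdown; destroy(linger=0) force-closes all sockets with linger 0 and then terminates. In brief:

```python
import zmq.asyncio

ctx = zmq.asyncio.Context()
# ... sockets created from ctx are used by the connector ...

# ctx.term() blocks until all sockets are closed and pending messages are
# flushed per each socket's LINGER value; destroy(linger=0) closes every
# socket with linger=0 and terminates the context immediately.
ctx.destroy(linger=0)
```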
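
"connect_parser set --prefill-addr and --decode-addr are required" (6e1fba8a73): with argparse this is just required=True on both options; the parser below is a stand-in, not the PR's actual CLI wiring.

```python
import argparse

connect_parser = argparse.ArgumentParser(prog="vllm connect")  # illustrative name
connect_parser.add_argument("--prefill-addr", required=True,
                            help="address of the prefill instance")
connect_parser.add_argument("--decode-addr", required=True,
                            help="address of the decode instance")

args = connect_parser.parse_args()  # errors out if either flag is missing
```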
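
"add /v1/completions stream support" (bfde1688e7): a connector sitting in front of prefill/decode instances has to relay streaming responses chunk by chunk rather than buffering them. A hedged sketch using FastAPI and aiohttp; the framework, route, and upstream address are assumptions, not necessarily what the PR uses.

```python
import aiohttp
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse

app = FastAPI()
DECODE_URL = "http://localhost:8200"  # placeholder decode-instance address

@app.post("/v1/completions")
async def proxy_completions(request: Request):
    payload = await request.json()

    async def relay():
        # Forward the request and yield upstream bytes as they arrive, so
        # '"stream": true' requests reach the client incrementally.
        async with aiohttp.ClientSession() as session:
            async with session.post(f"{DECODE_URL}/v1/completions",
                                    json=payload) as resp:
                async for chunk in resp.content.iter_any():
                    yield chunk

    return StreamingResponse(relay(), media_type="text/event-stream")
```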
1045 changed files with 25686 additions and 79000 deletions

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

View File

@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

View File

@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

View File

@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.30
- name: "exact_match,flexible-extract"
value: 0.465
limit: 1319
num_fewshot: 5

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

View File

@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:

View File

@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml

View File

@ -16,7 +16,7 @@ import numpy
import pytest
import yaml
RTOL = 0.08
RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
"LM_EVAL_TEST_DATA_FILE",
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

View File

@ -10,24 +10,15 @@ set -x
set -o pipefail
check_gpus() {
if command -v nvidia-smi; then
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
elif command -v amd-smi; then
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
fi
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
if command -v nvidia-smi; then
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
elif command -v amd-smi; then
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
fi
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
echo "GPU type is $gpu_type"
}
@ -99,15 +90,9 @@ kill_gpu_processes() {
# wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
elif command -v amd-smi; then
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
sleep 1
done
fi
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
# remove vllm config file
rm -rf ~/.config/vllm

View File

@ -63,12 +63,10 @@
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
"model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"draft_tensor_parallel_size": 1
}
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -3,10 +3,10 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
@ -14,10 +14,10 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
@ -31,10 +31,10 @@ steps:
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
@ -48,7 +48,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"
@ -57,7 +57,7 @@ steps:
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
@ -82,22 +82,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- block: "Build Neuron release image"
key: block-neuron-release-image-build
depends_on: ~
- label: "Build and publish Neuron release image"
depends_on: block-neuron-release-image-build
agents:
queue: neuron-postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"

View File

@ -98,13 +98,6 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_cutlass_moe.py \
--ignore=kernels/test_mamba_ssm_ssd.py \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_block_int8.py \
--ignore=kernels/test_fused_quant_layernorm.py \
--ignore=kernels/test_int8_kernel.py \
--ignore=kernels/test_triton_moe_ptpc_fp8.py \
--ignore=kernels/test_permute_cols.py"
fi
@ -112,33 +105,19 @@ fi
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--ignore=entrypoints/openai/test_root_path.py \
--ignore=entrypoints/openai/test_tokenization.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" entrypoints/llm "* ]]; then
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
@ -155,10 +134,9 @@ if [[ $commands == *"--shard-id="* ]]; then
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
@ -185,10 +163,9 @@ if [[ $commands == *"--shard-id="* ]]; then
fi
done
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \

View File

@ -5,8 +5,8 @@
set -ex
set -o pipefail
# cd 2 levels into the working directory
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

View File

@ -10,4 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build -t cpu-test -f docker/Dockerfile.s390x .
docker build -t cpu-test -f Dockerfile.ppc64le .

View File

@ -8,19 +8,15 @@ set -ex
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
}
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -40,8 +36,8 @@ function cpu_tests() {
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model

View File

@ -9,13 +9,11 @@ python3 use_existing_torch.py
# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
--file docker/Dockerfile \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
@ -25,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@ -5,7 +5,7 @@
set -ex
# Try building the docker image
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup
# certain versions of HPU software stack have a bug that can

View File

@ -3,7 +3,7 @@
set -euox pipefail
if [[ $# -lt 4 ]]; then
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi

View File

@ -35,7 +35,7 @@ else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
docker build -t "${image_name}" -f Dockerfile.neuron .
# Setup cleanup
remove_docker_container() {

View File

@ -1,9 +1,9 @@
#!/bin/bash
set -xue
set -e
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
@ -17,17 +17,12 @@ source /etc/environment
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest pytest-asyncio tpu-info \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
&& tpu-info \
&& echo TEST_0 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
&& echo TEST_1 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
@ -35,20 +30,9 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_4 \
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
&& echo TEST_6 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
&& echo TEST_7 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
&& echo TEST_8 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
&& echo TEST_9 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
&& echo TEST_10 \
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
&& echo TEST_11 \
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \

View File

@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t ${image_name} -f docker/Dockerfile.xpu .
docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() {

View File

@ -1,45 +0,0 @@
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Setup cleanup
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman rm -f "$container_id" || true
fi
podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
# Run the image
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
podman exec -it "$container_id" bash -c "
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests

View File

@ -8,7 +8,6 @@
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
@ -71,7 +70,6 @@ steps:
- label: Basic Correctness Test # 30min
#mirror_hardwares: [amd]
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
@ -106,8 +104,7 @@ steps:
- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
@ -121,7 +118,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@ -138,14 +135,12 @@ steps:
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@ -158,7 +153,6 @@ steps:
- popd
- label: Metrics, Tracing Test # 10min
mirror_hardwares: [amd]
num_gpus: 2
source_file_dependencies:
- vllm/
@ -166,13 +160,18 @@ steps:
- tests/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing
##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/test_regression
@ -203,13 +202,12 @@ steps:
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/entrypoints
- pytest -v -s v1/engine
- pytest -v -s v1/entrypoints
- pytest -v -s v1/sample
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_stats.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
@ -285,22 +283,13 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
#mirror_hardwares: [amd]
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
parallelism: 4
- label: PyTorch Compilation Unit Tests
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- label: PyTorch Fullgraph Smoke Test # 9min
source_file_dependencies:
- vllm/
@ -310,6 +299,7 @@ steps:
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/test_pass_manager.py
- label: PyTorch Fullgraph Test # 18min
source_file_dependencies:
@ -318,49 +308,18 @@ steps:
commands:
- pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test
- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
- tests/kernels/core
commands:
- pytest -v -s kernels/core
- label: Kernels Attention Test %N
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
- tests/kernels/attention
- tests/kernels
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Quantization Test %N
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test
source_file_dependencies:
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
commands:
- pytest -v -s kernels/moe
- label: Kernels Mamba Test
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
commands:
- pytest -v -s kernels/mamba
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Tensorizer Test # 11min
# mirror_hardwares: [amd]
mirror_hardwares: [amd]
soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
@ -376,14 +335,7 @@ steps:
source_file_dependencies:
- benchmarks/
commands:
- bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test # 10min
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
- bash run-benchmarks.sh
- label: Quantization Test # 33min
source_file_dependencies:
@ -418,14 +370,12 @@ steps:
- label: OpenAI-Compatible Tool Use # 20 min
fast_check: false
#mirror_hardwares: [ amd ]
mirror_hardwares: [ amd ]
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/mistral_tool_use
commands:
- pytest -v -s tool_use
- pytest -v -s mistral_tool_use
##### models test #####
@ -437,9 +387,7 @@ steps:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
@ -449,8 +397,6 @@ steps:
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
@ -462,8 +408,6 @@ steps:
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
@ -480,12 +424,11 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
- label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true
@ -499,7 +442,10 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
@ -515,7 +461,6 @@ steps:
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
mirror_hardwares: [amd]
optional: true
commands:
- echo 'Testing custom models...'
@ -527,7 +472,6 @@ steps:
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@ -570,10 +514,7 @@ steps:
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
@ -584,14 +525,11 @@ steps:
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
@ -654,10 +592,14 @@ steps:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min

.github/CODEOWNERS (vendored, 1 line changed)
View File

@ -12,7 +12,6 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
CMakeLists.txt @tlrmchlsmth
# vLLM V1

View File

@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -14,7 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -9,7 +9,7 @@ body:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea
attributes:
label: The model to consider.

View File

@ -35,7 +35,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```

View File

@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

.github/mergify.yml (vendored, 66 lines changed)
View File

@ -19,7 +19,7 @@ pull_request_rules:
- files~=\.buildkite/
- files~=^cmake/
- files=CMakeLists.txt
- files~=^docker/Dockerfile
- files~=^Dockerfile
- files~=^requirements.*\.txt
- files=setup.py
actions:
@ -55,19 +55,11 @@ pull_request_rules:
description: Automatically apply structured-output label
conditions:
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh
- files=docs/source/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/
- files=benchmarks/benchmark_serving_guided.py
- files=benchmarks/benchmark_guided.py
actions:
label:
add:
@ -96,58 +88,6 @@ pull_request_rules:
add:
- v1
- name: label-tpu
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
- or:
- files~=tpu.py
- files~=_tpu
- files~=tpu_
- files~=/tpu/
- files~=pallas
actions:
label:
add:
- tpu
- name: label-tpu-remove
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
- and:
- -files~=tpu.py
- -files~=_tpu
- -files~=tpu_
- -files~=/tpu/
- -files~=pallas
actions:
label:
remove:
- tpu
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/source/features/tool_calling.md
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
- files=docs/source/getting_started/examples/chat_with_tools.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict

View File

@ -50,7 +50,7 @@ jobs:
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
- name: Configuration of docker images, network and namespace for the kind cluster
run: |

.gitignore (vendored, 5 lines changed)
View File

@ -2,7 +2,7 @@
/vllm/_version.py
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
vllm/vllm_flash_attn/
# Byte-compiled / optimized / DLL files
__pycache__/
@ -202,6 +202,3 @@ benchmarks/**/*.json
# Linting
actionlint
shellcheck*/
# Ingore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -1,6 +1,3 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
- manual # Run in CI
@ -11,6 +8,7 @@ repos:
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.3
hooks:
@ -121,12 +119,6 @@ repos:
language: system
always_run: true
pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
language: script
files: ^docker/Dockerfile$
pass_filenames: false
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
@ -230,12 +230,10 @@ set(VLLM_EXT_SRC
"csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu"
"csrc/attention/merge_attn_states.cu"
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
@ -243,7 +241,6 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -251,7 +248,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -269,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.9.0
GIT_TAG v3.8.0
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -285,13 +282,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/attention/mla/cutlass_mla_entry.cu")
"csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@ -464,52 +461,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(FP4_ARCHS)
endif()
# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
# Add MLA-specific include directories only to MLA source files
set_source_files_properties(${SRCS}
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
else()
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
# clear MLA_ARCHS
set(MLA_ARCHS)
endif()
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
"if you intend on running FP8 quantized MoE models on Hopper.")
else()
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# Machete kernels
@ -629,51 +580,21 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
"csrc/moe/marlin_moe_ops.cu")
#
# For the Marlin MOE kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
)
if (NOT moe_marlin_generation_result EQUAL 0)
message(FATAL_ERROR "Marlin MOE generation failed."
" Result: \"${moe_marlin_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
else()
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
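The block above re-runs Marlin MoE kernel generation only when the MD5 of `generate_kernels.py` differs from the hash stored in the CMake cache. For readers less familiar with CMake caching, here is a minimal Python sketch of the same hash-gating idea; the stamp-file path and helper name are illustrative, not part of the build:

```python
# Sketch of the hash-gated generation pattern used above: run an expensive
# codegen step only when the generator script itself has changed.
import hashlib
import pathlib
import subprocess


def generate_if_changed(script: pathlib.Path, stamp: pathlib.Path) -> None:
    current = hashlib.md5(script.read_bytes()).hexdigest()
    previous = stamp.read_text().strip() if stamp.exists() else None
    if previous == current:
        print("Generation script has not changed, skipping generation.")
        return
    # Counterpart of the execute_process() call: run the generator and record
    # its hash only after it succeeds (mirrors the FATAL_ERROR branch above).
    subprocess.run(["python3", str(script)], check=True)
    stamp.write_text(current)
```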
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}"
SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@ -698,7 +619,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
#
set(VLLM_ROCM_EXT_SRC
"csrc/rocm/torch_bindings.cpp"
"csrc/rocm/skinny_gemms.cu"
"csrc/rocm/attention.cu")
define_gpu_extension_target(


@ -14,22 +14,17 @@ ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -51,22 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
fi
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt
uv pip install -r requirements/cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -91,7 +83,7 @@ COPY requirements/build.txt requirements/build.txt
ENV UV_HTTP_TIMEOUT=500
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
uv pip install -r requirements/build.txt
COPY . .
ARG GIT_REPO_CHECK=0
@ -162,11 +154,8 @@ ENV UV_HTTP_TIMEOUT=500
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt
uv pip install -r requirements/dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
@ -182,23 +171,18 @@ ARG TARGETPLATFORM
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -216,14 +200,13 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
fi
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose
uv pip install dist/*.whl --verbose
# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
@ -238,13 +221,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
fi
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
# Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
@ -252,7 +232,7 @@ COPY ./vllm/collect_env.py .
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
uv pip install -r requirements/build.txt
#################### vLLM installation IMAGE ####################
@ -268,19 +248,16 @@ ADD . /vllm-workspace/
ENV UV_HTTP_TIMEOUT=500
# install development dependencies (for testing)
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt
uv pip install -r requirements/dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
uv pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet)
@ -297,7 +274,6 @@ RUN mv vllm test_docs/
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -306,9 +282,9 @@ ENV UV_HTTP_TIMEOUT=500
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image

Dockerfile.cpu Normal file

@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
FROM ubuntu:22.04 AS cpu-test-1
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp==2025.0.1
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements/build.txt
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
COPY ./ /workspace/vllm


@ -1,6 +1,6 @@
# default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
FROM $BASE_IMAGE
@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict

Dockerfile.ppc64le Normal file

@ -0,0 +1,37 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
# Some packages in requirements/cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements/cpu.txt \
xformers uvloop==0.20.0
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]


@ -12,8 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
# Remove sccache
RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
@ -41,7 +40,7 @@ ARG USE_CYTHON
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR


@ -1,18 +1,18 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
ARG HIPBLASLT_BRANCH="db8e93b4"
ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="295f2ed4"
ARG PYTORCH_VISION_BRANCH="v0.21.0"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="7e1ed08"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.12
@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
&& apt-get install -y software-properties-common git curl sudo vim less \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@ -42,7 +42,7 @@ RUN apt-get update -y \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
@ -60,8 +60,7 @@ RUN cd hipBLAS-common \
RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \
&& make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -111,24 +110,11 @@ RUN git clone ${FA_REPO}
RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
ARG AITER_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \
@ -144,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl
ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH
ARG RCCL_REPO
@ -161,8 +154,6 @@ ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -176,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt


@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
cd ../../python && \
export PYARROW_PARALLEL=4 && \
export ARROW_BUILD_TYPE=release && \
uv pip install -r requirements-build.txt && \
uv pip install -r requirements/build.txt && \
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel
FROM python-install AS numa-build
@ -96,22 +96,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
python setup.py bdist_wheel
FROM python-install AS hf-xet-builder
# Install hf-xet
WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/huggingface/xet-core.git && \
cd xet-core/hf_xet/ && \
uv pip install maturin patchelf && \
python -m maturin build --release --out dist && \
mkdir -p /tmp/hf-xet/dist && \
cp dist/*.whl /tmp/hf-xet/dist/
# Final build stage
FROM python-install AS vllm-cpu
ARG PYTHON_VERSION
@ -136,15 +120,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
uv pip install -v \
$ARROW_WHL_FILE \
$VISION_WHL_FILE \
$HF_XET_WHL_FILE \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
--index-strategy unsafe-best-match \
-r requirements/build.txt \
@ -168,5 +149,4 @@ USER 2000
WORKDIR /home/vllm
# Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]


@ -10,40 +10,28 @@ Easy, fast, and cheap LLM serving for everyone
</h3>
<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>
---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
---
*Latest News* 🔥
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
<details>
<summary>Previous News</summary>
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
</details>
- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -98,7 +86,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing
We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
## Sponsors
@ -121,7 +109,6 @@ Compute Resources:
- Databricks
- DeepInfra
- Google Cloud
- Intel
- Lambda Lab
- Nebius
- Novita AI


@ -41,39 +41,29 @@ become available.
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmarena-ai/VisionArena-Chat</code></td>
<td><strong>HuggingFace</strong></td>
<td style="text-align: center;">🟡</td>
<td style="text-align: center;">🟡</td>
<td>Specify your dataset path on HuggingFace</td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td><strong>VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr>
<tr>
<td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
</tbody>
</table>
✅: supported
🟡: Partial support
🚧: to be supported
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArena's `dataset-name` should be set to `hf`
---
## Example - Online Benchmark
@ -81,7 +71,8 @@ become available.
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
vllm serve ${MODEL_NAME} --disable-log-requests
```
Then run the benchmarking script
@ -89,13 +80,12 @@ Then run the benchmarking script
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```
If successful, you will see the following output
@ -132,105 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--hf-split train \
--num-prompts 1000
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}"
```
### InstructCoder Benchmark with Speculative Decoding
### HuggingFaceDataset Examples
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-model "[ngram]" \
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Other HuggingFaceDataset Examples
Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
formats, please consider contributing.
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-subset "${DATASET_SUBSET}"
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--top-k 10 \
--top-p 0.9 \
--temperature 0.5 \
--num-prompts 10
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}" \
```
---
## Example - Offline Throughput Benchmark
```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts 10
--model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}"
```
If successful, you will see the following output
@ -244,13 +217,19 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models
``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts 1000 \
--hf-split train
--model "${MODEL_NAME}" \
--backend "vllm-chat" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-split "${DATASET_SPLIT}"
```
The `num prompt tokens` now includes image token counts
@ -261,83 +240,29 @@ Total num prompt tokens: 14527
Total num output tokens: 1280
```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-model="[ngram]" \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
### Benchmark with LoRA Adapters
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \
--model meta-llama/Llama-2-7b-hf \
--backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name sharegpt \
--num-prompts 10 \
--max-loras 2 \
--max-lora-rank 8 \
--enable-lora \
--lora-path yard1/llama-2-7b-sql-lora-test
--model "${MODEL_NAME}" \
--backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \
--dataset_name "${DATASET_NAME}" \
--num-prompts "${NUM_PROMPTS}" \
--max-loras "${MAX_LORAS}" \
--max-lora-rank "${MAX_LORA_RANK}" \
${ENABLE_LORA} \
--lora-path "${LORA_PATH}"
```


@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import io
import json
import os
import sys
@ -33,7 +32,6 @@ class RequestFuncInput:
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
@dataclass
@ -221,15 +219,7 @@ async def async_request_deepspeed_mii(
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.generated_text = parsed_resp["text"][0]
output.success = True
else:
output.error = response.reason or ""
@ -438,110 +428,6 @@ async def async_request_openai_chat_completions(
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(
("transcriptions", "translations"
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
"or `translations`."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
form = aiohttp.FormData()
form.add_field('file', f, content_type='audio/wav')
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
@ -599,14 +485,7 @@ ASYNC_REQUEST_FUNCS = {
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_openai_chat_completions)
]
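The registry above is how the benchmark scripts pick a request function: the `--backend` string indexes into `ASYNC_REQUEST_FUNCS`, and the chosen coroutine is awaited once per request. A hypothetical usage sketch, reusing the module's registry and dataclasses (the wrapper name is illustrative):

```python
# Illustrative dispatch through the registry defined above; each request
# function returns a RequestFuncOutput with latency/TTFT/ITL statistics.
import asyncio


def run_one(backend: str, request_func_input):
    # Look up the coroutine registered for this --backend value and run it.
    try:
        request_func = ASYNC_REQUEST_FUNCS[backend]
    except KeyError as e:
        raise ValueError(f"Unknown backend: {backend}") from e
    return asyncio.run(request_func(request_func_input=request_func_input))
```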


@ -23,8 +23,7 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from functools import cache
from io import BytesIO
from typing import Any, Callable, Optional, Union
from typing import Any, Optional, Union
import numpy as np
import pandas as pd
@ -64,7 +63,6 @@ class SampleRequest:
class BenchmarkDataset(ABC):
DEFAULT_SEED = 0
IS_MULTIMODAL = False
def __init__(
self,
@ -241,24 +239,21 @@ def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
Supports three input types:
For a PIL.Image.Image input:
- Converts the image to RGB.
- Saves the image as a JPEG in-memory.
- Encodes the JPEG data as a base64 string.
- Returns a dictionary with the image as a base64 data URL.
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
For a string input:
- Treats the string as a URL or file path.
- Prepends "file://" if the string doesn't start with "http://" or
"file://".
- Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
ValueError: If the input is neither a PIL.Image.Image nor a string.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
@ -277,8 +272,8 @@ def process_image(image: Any) -> Mapping[str, Any]:
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
raise ValueError(
f"Invalid image input {image}. Must be a PIL.Image.Image or str.")
# -----------------------------------------------------------------------------
@ -289,7 +284,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
class RandomDataset(BenchmarkDataset):
# Default values copied from benchmark_serving.py for the random dataset.
DEFAULT_PREFIX_LEN = 0
DEFAULT_RANGE_RATIO = 0.0
DEFAULT_RANGE_RATIO = 1.0
DEFAULT_INPUT_LEN = 1024
DEFAULT_OUTPUT_LEN = 128
@ -309,32 +304,19 @@ class RandomDataset(BenchmarkDataset):
output_len: int = DEFAULT_OUTPUT_LEN,
**kwargs,
) -> list[SampleRequest]:
# Enforce range_ratio < 1
assert range_ratio < 1.0, (
"random_range_ratio must be < 1.0 to ensure a valid sampling range"
)
vocab_size = tokenizer.vocab_size
prefix_token_ids = (np.random.randint(
0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
# New sampling logic: [X * (1 - b), X * (1 + b)]
input_low = int(input_len * (1 - range_ratio))
input_high = int(input_len * (1 + range_ratio))
output_low = int(output_len * (1 - range_ratio))
output_high = int(output_len * (1 + range_ratio))
# Add logging for debugging
logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
logger.info("Sampling output_len from [%s, %s]", output_low,
output_high)
input_low = int(input_len * range_ratio)
output_low = int(output_len * range_ratio)
input_lens = np.random.randint(input_low,
input_high + 1,
input_len + 1,
size=num_requests)
output_lens = np.random.randint(output_low,
output_high + 1,
output_len + 1,
size=num_requests)
offsets = np.random.randint(0, vocab_size, size=num_requests)
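A small worked example of the new symmetric sampling window, next to the behaviour it replaces (numbers are illustrative):

```python
# New sampling window: [X * (1 - b), X * (1 + b)], with b = range_ratio < 1.0.
# The previous logic sampled from [X * b, X] instead.
import numpy as np

input_len, range_ratio = 1024, 0.25            # example values only
low = int(input_len * (1 - range_ratio))       # 768
high = int(input_len * (1 + range_ratio))      # 1280
lens = np.random.randint(low, high + 1, size=8)
assert lens.min() >= 768 and lens.max() <= 1280
```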
@ -486,11 +468,11 @@ class SonnetDataset(BenchmarkDataset):
# Determine how many poem lines to use.
num_input_lines = round((input_len - base_offset) / avg_len)
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
num_prefix_lines = round((prefix_len - base_offset) / avg_len)
prefix_lines = self.data[:num_prefix_lines]
samples = []
while len(samples) < num_requests:
for _ in range(num_requests):
extra_lines = random.choices(self.data,
k=num_input_lines - num_prefix_lines)
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
@ -498,14 +480,13 @@ class SonnetDataset(BenchmarkDataset):
prompt_formatted = tokenizer.apply_chat_template(
msg, add_generation_prompt=True, tokenize=False)
prompt_len = len(tokenizer(prompt_formatted).input_ids)
if prompt_len <= input_len:
samples.append(
SampleRequest(
prompt=prompt_formatted
if return_prompt_formatted else prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
samples.append(
SampleRequest(
prompt=prompt_formatted
if return_prompt_formatted else prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
return samples
@ -581,48 +562,48 @@ class BurstGPTDataset(BenchmarkDataset):
# -----------------------------------------------------------------------------
# HuggingFace Dataset Base Implementation
# HuggingFace Dataset Implementation
# -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
"""Base class for datasets hosted on HuggingFace."""
SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
class HuggingFaceDataset(BenchmarkDataset):
"""
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
def __init__(
self,
dataset_path: str,
dataset_split: str,
dataset_subset: Optional[str] = None,
**kwargs,
) -> None:
super().__init__(dataset_path=dataset_path, **kwargs)
super().__init__(**kwargs)
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
self.load_data()
def load_data(self) -> None:
"""Load data from HuggingFace datasets."""
if not self.dataset_path:
raise ValueError("dataset_path must be provided for loading data.")
self.data = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = self.data.shuffle(seed=self.random_seed)
# -----------------------------------------------------------------------------
# Conversation Dataset Implementation
# -----------------------------------------------------------------------------
class ConversationDataset(HuggingFaceDataset):
"""Dataset for conversation data with multimodal support."""
SUPPORTED_DATASET_PATHS = {
'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
}
IS_MULTIMODAL = True
if self.data.features is None or "conversations" \
not in self.data.features:
raise ValueError(
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2)
def sample(self,
tokenizer: PreTrainedTokenizerBase,
@ -630,13 +611,10 @@ class ConversationDataset(HuggingFaceDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(
lambda x: len(x["conversations"]) >= 2)
sampled_requests = []
dynamic_output = output_len is None
for item in filtered_data:
for item in self.data:
if len(sampled_requests) >= num_requests:
break
conv = item["conversations"]
@ -681,13 +659,29 @@ class VisionArenaDataset(HuggingFaceDataset):
"""
DEFAULT_OUTPUT_LEN = 128
SUPPORTED_DATASET_PATHS = {
"lmarena-ai/VisionArena-Chat":
lambda x: x["conversation"][0][0]["content"],
"lmarena-ai/vision-arena-bench-v0.1":
lambda x: x["turns"][0][0]["content"]
}
IS_MULTIMODAL = True
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
if self.dataset_subset is None and self.dataset_split != "train":
raise ValueError("Dataset split must be 'train'.")
self.load_data()
def load_data(self) -> None:
dataset = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample(
self,
@ -703,11 +697,7 @@ class VisionArenaDataset(HuggingFaceDataset):
for item in self.data:
if len(sampled_requests) >= num_requests:
break
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
if parser_fn is None:
raise ValueError(
f"Unsupported dataset path: {self.dataset_path}")
prompt = parser_fn(item)
prompt = item["turns"][0][0]["content"]
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat:
@ -725,173 +715,3 @@ class VisionArenaDataset(HuggingFaceDataset):
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------
class InstructCoderDataset(HuggingFaceDataset):
"""
InstructCoder Dataset.
https://huggingface.co/datasets/likaixin/InstructCoder
InstructCoder is a dataset designed for general code editing. It consists
of 114,239 instruction-input-output triplets and covers multiple distinct
code editing scenarios.
"""
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
SUPPORTED_DATASET_PATHS = {
"likaixin/InstructCoder",
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------
class AIMODataset(HuggingFaceDataset):
"""
Dataset class for processing an AIMO dataset with reasoning questions.
"""
SUPPORTED_DATASET_PATHS = {
"AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
"AI-MO/NuminaMath-CoT"
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs) -> list:
sampled_requests = []
dynamic_output = output_len is None
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt, completion = item['problem'], item["solution"]
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
completion_len = len(completion_ids)
output_len = completion_len if dynamic_output else output_len
assert isinstance(output_len, int) and output_len > 0
if dynamic_output and not is_valid_sequence(prompt_len,
completion_len,
max_prompt_len=2048,
max_total_len=32000):
continue
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=None,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# ASR Dataset Implementation
# -----------------------------------------------------------------------------
class ASRDataset(HuggingFaceDataset):
"""
Dataset class for processing an ASR dataset for transcription.
Tested on the following set:
+----------------+----------------------------------------+--------------------------+-----------------------------+
| Dataset | Domain | Speaking Style | hf-subset |
+----------------+----------------------------------------+--------------------------+-----------------------------+
| TED-LIUM | TED talks | Oratory | release1, release2, release3|
| | | | release3-speaker-adaptation |
| VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
| LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" |
| GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
| SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
| AMI | Meetings | Spontaneous | ihm, sdm |
+----------------+----------------------------------------+--------------------------+-----------------------------+
""" # noqa: E501
SUPPORTED_DATASET_PATHS = {
"openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
"edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
}
DEFAULT_OUTPUT_LEN = 128
IS_MULTIMODAL = True
# TODO Whisper-specific. Abstract interface when more models are supported.
TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\
"<|notimestamps|>"
skip_long_audios: bool = True
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs,
) -> list:
import librosa
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests = []
skipped = 0
for item in self.data:
if len(sampled_requests) >= num_requests:
break
audio = item["audio"]
y, sr = audio["array"], audio["sampling_rate"]
duration_s = librosa.get_duration(y=y, sr=sr)
# Whisper max supported duration
if self.skip_long_audios and duration_s > 30:
skipped += 1
continue
mm_content = {"audio": (y, sr)}
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
))
if skipped:
logger.warning("%d samples discarded from dataset due to" \
" their length being greater than" \
" what Whisper supports.", skipped)
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
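The 30-second cutoff above is the main Whisper-specific constraint; a small self-contained sketch of that duration check, with a hypothetical 45-second silent clip standing in for a dataset row:

import librosa
import numpy as np

sr = 16000
y = np.zeros(45 * sr, dtype=np.float32)  # 45 s of silence at 16 kHz

duration_s = librosa.get_duration(y=y, sr=sr)
if duration_s > 30:
    # Mirrors skip_long_audios: Whisper consumes 30 s windows, so longer
    # clips are skipped rather than truncated.
    print(f"skipping clip ({duration_s:.1f} s > 30 s)")
else:
    mm_content = {"audio": (y, sr)}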

View File

@ -63,16 +63,14 @@ class Request:
output_len: int
def sample_tokens(tokenizer: PreTrainedTokenizerBase,
length: int) -> list[int]:
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
vocab = tokenizer.get_vocab()
all_special_ids = set(tokenizer.all_special_ids)
# Remove the special tokens.
return random.choices(
[v for k, v in vocab.items() if k not in all_special_ids],
k=length,
)
vocab = {
k: v
for k, v in vocab.items() if k not in tokenizer.all_special_ids
}
return random.choices(list(vocab.values()), k=length)
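As a rough usage sketch (assuming a HuggingFace tokenizer), the sampled token ids can be decoded into a synthetic prompt; note that both variants shown above return a list of ids, even though one annotates the return type as str:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative only

token_ids = sample_tokens(tokenizer, 32)        # 32 ids drawn from the vocab
synthetic_prompt = tokenizer.decode(token_ids)  # turn them back into text
print(len(token_ids), synthetic_prompt[:80])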
def sample_requests_from_dataset(

View File

@ -7,6 +7,9 @@ On the server side, run one of the following commands:
--swap-space 16 \
--disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend <backend> \
@ -34,8 +37,7 @@ from datetime import datetime
from typing import Any, Optional
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
@ -50,11 +52,9 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
ConversationDataset, HuggingFaceDataset,
InstructCoderDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -156,7 +156,7 @@ def calculate_metrics(
if outputs[i].success:
output_len = outputs[i].output_tokens
if not output_len:
if output_len is None:
# We use the tokenizer to count the number of output tokens
# for some serving backends instead of looking at
# len(outputs[i].itl) since multiple output tokens may be
@ -261,7 +261,6 @@ async def benchmark(
goodput_config_dict: dict[str, float],
max_concurrency: Optional[int],
lora_modules: Optional[Iterable[str]],
extra_body: Optional[dict],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -274,6 +273,10 @@ async def benchmark(
input_requests[0].expected_output_len, \
input_requests[0].multi_modal_data
if backend != "openai-chat" and test_mm_content is not None:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' backend.")
assert test_mm_content is None or isinstance(test_mm_content, dict)
test_input = RequestFuncInput(
model=model_id,
@ -285,7 +288,6 @@ async def benchmark(
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body,
)
test_output = await request_func(request_func_input=test_input)
@ -312,8 +314,7 @@ async def benchmark(
output_len=test_output_len,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body)
ignore_eos=ignore_eos)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@ -363,8 +364,7 @@ async def benchmark(
output_len=output_len,
logprobs=logprobs,
multi_modal_content=mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body)
ignore_eos=ignore_eos)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
@ -586,49 +586,19 @@ def main(args: argparse.Namespace):
return_prompt_formatted=True)
elif args.dataset_name == "hf":
# all following datasets are implemented from the
# HuggingFaceDataset base class
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_class = VisionArenaDataset
args.hf_split = "train"
args.hf_subset = None
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_class = InstructCoderDataset
args.hf_split = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ConversationDataset
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_class = AIMODataset
args.hf_split = "train"
elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ASRDataset
args.hf_split = "train"
else:
supported_datasets = set([
dataset_name for cls in HuggingFaceDataset.__subclasses__()
for dataset_name in cls.SUPPORTED_DATASET_PATHS
])
raise ValueError(
f"Unsupported dataset path: {args.dataset_path}. "
"Huggingface dataset only supports dataset_path"
f" from one of following: {supported_datasets}. "
"Please consider contributing if you would "
"like to add support for additional dataset formats.")
if (dataset_class.IS_MULTIMODAL and backend not in \
["openai-chat", "openai-audio"]):
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' and " \
"'openai-audio' backend.")
# Choose between VisionArenaDataset
# and HuggingFaceDataset based on provided parameters.
dataset_class = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
input_requests = dataset_class(
dataset_path=args.dataset_path,
dataset_subset=args.hf_subset,
dataset_split=args.hf_split,
random_seed=args.seed,
).sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
random_seed=args.seed,
output_len=args.hf_output_len,
)
@ -663,26 +633,6 @@ def main(args: argparse.Namespace):
raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
goodput_config_dict = check_goodput_args(args)
# Collect the sampling parameters.
sampling_params = {
k: v
for k, v in {
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
"temperature": args.temperature
}.items() if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError(
"Sampling parameters are only supported by openai-compatible "
"backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
# Avoid GC processing "static" data - reduce pause times.
gc.collect()
gc.freeze()
@ -709,11 +659,10 @@ def main(args: argparse.Namespace):
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
))
# Save config and results to json
if args.save_result or args.append_result:
if args.save_result:
result_json: dict[str, Any] = {}
# Setup
@ -734,14 +683,6 @@ def main(args: argparse.Namespace):
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format."
)
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
# Merge with benchmark result
result_json = {**result_json, **benchmark_result}
if not args.save_detailed:
# Remove fields with too many data points
@ -752,6 +693,15 @@ def main(args: argparse.Namespace):
if field in result_json:
del result_json[field]
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
# Merge with benchmark result
result_json = {**result_json, **benchmark_result}
# Save to file
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
@ -761,12 +711,7 @@ def main(args: argparse.Namespace):
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name,
mode="a+" if args.append_result else "w",
encoding='utf-8') as outfile:
# Append a newline.
if args.append_result and outfile.tell() != 0:
outfile.write("\n")
with open(file_name, "w", encoding='utf-8') as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
@ -898,11 +843,6 @@ if __name__ == "__main__":
help="When saving the results, whether to include per request "
"information such as response, error, ttfs, tpots, etc.",
)
parser.add_argument(
"--append-result",
action="store_true",
help="Append the benchmark result to the existing json file.",
)
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
@ -936,7 +876,7 @@ if __name__ == "__main__":
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
@ -944,7 +884,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
@ -1011,23 +951,18 @@ if __name__ == "__main__":
random_group.add_argument(
"--random-range-ratio",
type=float,
default=0.0,
help="Range ratio for sampling input/output length, "
"used only for random sampling. Must be in the range [0, 1) to define "
"a symmetric sampling range"
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
random_group.add_argument(
"--random-prefix-len",
type=int,
default=0,
help=("Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."),
)
help="Number of fixed prefix tokens before random "
" context. The length range of context in a random "
" request is [random-prefix-len, "
" random-prefix-len + random-prefix-len * random-range-ratio).")
hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset",
@ -1046,33 +981,6 @@ if __name__ == "__main__":
"from the sampled HF dataset.",
)
sampling_group = parser.add_argument_group("sampling parameters")
sampling_group.add_argument(
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0).")
parser.add_argument(
'--tokenizer-mode',
type=str,

View File

@ -5,13 +5,16 @@ On the server side, run one of the following commands:
(vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \
--backend <backend> \
--model <your_model> \
--dataset json \
--structured-output-ratio 1.0 \
--structured-output-backend auto \
--structured-output-backend xgrammar \
--request-rate 10 \
--num-prompts 1000
@ -51,7 +54,7 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from vllm.v1.structured_output.backend_xgrammar import (
from vllm.v1.structured_output.utils import (
has_xgrammar_unsupported_json_features)
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -130,11 +133,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
"description":
"An unique optional field to avoid cached schemas"
}
else:
json_schemas = [schema] * args.num_prompts
def gen_prompt(index: int):
return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
schema = json_schemas[index % len(json_schemas)]
return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
def get_schema(index: int):
return json_schemas[index % len(json_schemas)]
@ -150,17 +152,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
elif args.dataset == "grammar":
schema = """
root ::= select_statement
?start: select_statement
select_statement ::= "SELECT " column " from " table " where " condition
?select_statement: "SELECT " column_list " FROM " table_name
column ::= "col_1 " | "col_2 "
?column_list: column_name ("," column_name)*
table ::= "table_1 " | "table_2 "
?table_name: identifier
condition ::= column "= " number
?column_name: identifier
number ::= "1 " | "2 "
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table."
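For intuition, the EBNF-style grammar above only derives statements over a fixed vocabulary, roughly of the form "SELECT col_1 from table_1 where col_1 = 1", while the Lark-style variant accepts arbitrary identifiers, e.g. "SELECT username,email FROM users"; the prompt just above is what the model sees, and the grammar is what the structured-output backend enforces.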
@ -964,7 +966,7 @@ if __name__ == "__main__":
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
help="Comma-seperated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
@ -972,7 +974,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
help="Comma-seperated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
@ -997,14 +999,12 @@ if __name__ == "__main__":
type=float,
default=1.0,
help="Ratio of Structured Outputs requests")
parser.add_argument("--structured-output-backend",
type=str,
choices=[
"outlines", "lm-format-enforcer", "xgrammar",
"guidance", "auto"
],
default="auto",
help="Backend to use for structured outputs")
parser.add_argument(
"--structured-output-backend",
type=str,
choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
default="xgrammar",
help="Backend to use for structured outputs")
args = parser.parse_args()
main(args)

View File

@ -11,8 +11,7 @@ from typing import Any, Optional, Union
import torch
import uvloop
from benchmark_dataset import (AIMODataset, BurstGPTDataset,
ConversationDataset, InstructCoderDataset,
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -213,17 +212,14 @@ def run_hf(
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
prompt = requests[i].prompt
prompt_len = requests[i].prompt_len
output_len = requests[i].expected_output_len
prompt, prompt_len, output_len = requests[i]
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len
next_output_len = requests[i + 1].expected_output_len
_, next_prompt_len, next_output_len = requests[i + 1]
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
@ -304,7 +300,6 @@ def get_requests(args, tokenizer):
"input_len": args.input_len,
"output_len": args.output_len,
}
if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len
@ -322,23 +317,18 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs['dataset_split'] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
if args.backend != "vllm-chat":
raise ValueError(
"hf datasets only are supported by vllm-chat backend")
# Choose between VisionArenaDataset and HuggingFaceDataset based on
# provided parameters.
dataset_cls = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values
@ -472,17 +462,9 @@ def validate_args(args):
warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
elif args.dataset_name == "hf":
if args.dataset_path in (
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
| ConversationDataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501
elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501
else:
raise ValueError(
f"{args.dataset_path} is not supported by hf dataset.")
elif args.dataset_name == "hf" and args.backend != "vllm-chat":
raise ValueError(
"When --dataset-name is 'hf', backend must be 'vllm-chat'")
# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None:
@ -523,13 +505,6 @@ def validate_args(args):
raise ValueError(
"Tokenizer must be the same as the model for MII backend.")
# --data-parallel is not supported currently.
# https://github.com/vllm-project/vllm/issues/16222
if args.data_parallel_size > 1:
raise ValueError(
"Data parallel is not supported in offline benchmark, \
please use benchmark serving instead")
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
@ -601,30 +576,18 @@ if __name__ == "__main__":
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser.add_argument(
"--prefix-len",
type=int,
default=None,
help=f"Number of prefix tokens to be used in RandomDataset "
"and SonnetDataset. For RandomDataset, the total input "
"length is the sum of prefix-len (default: "
f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
"sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]. For SonnetDataset, "
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
"controls how much of the input is fixed lines versus "
"random lines, but the total input length remains approximately "
"input_len tokens.")
parser.add_argument("--prefix-len",
type=int,
default=None,
help="Number of prefix tokens per request."
"This is for the RandomDataset and SonnetDataset")
# random dataset
parser.add_argument(
"--random-range-ratio",
type=float,
default=None,
help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
"for sampling input/output length, "
"used only for RandomDataset. Must be in the range [0, 1) to "
"define a symmetric sampling range "
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
help="Range of sampled ratio of input/output length, "
"used only for RandomDataSet.",
)
# hf dataset

View File

@ -1,236 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION)
try:
import bitblas
if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
raise ImportError("bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
except ImportError as e:
bitblas_import_exception = e
raise ValueError("Trying to use the bitblas backend, but could not import"
f"with the following error: {bitblas_import_exception}. "
"Please install bitblas through the following command: "
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
) from bitblas_import_exception
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target.")
# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking.",
)
parser.add_argument("--group_size",
type=int,
default=None,
help="Group size for grouped quantization.")
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32", "int8"],
help="Data type of activation A.",
)
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=[
"float16",
"float32",
"float64",
"int32",
"int8",
"int4",
"int2",
"int1",
"nf4",
"fp4_e2m1",
],
help="Data type of weight W.",
)
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
choices=["float16", "int32"],
help="Data type for accumulation.",
)
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
choices=["float16", "float32", "int32", "int8"],
help="Data type for output.",
)
parser.add_argument(
"--layout",
type=str,
default="nt",
choices=["nt", "nn"],
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument("--with_bias",
action="store_true",
help="Include bias in the benchmark.")
parser.add_argument(
"--with_scaling",
action="store_true",
help="Include scaling factor in the quantization.",
)
parser.add_argument("--with_zeros",
action="store_true",
help="Include zeros in the quantization.")
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
choices=["original", "rescale", "quantized"],
help="Specify the mode for calculating zeros.",
)
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
# Define a list of shared arguments that repeat in every config
shared_args = [
A_dtype,
W_dtype,
out_dtype,
accum_dtype,
layout,
with_bias,
group_size,
with_scaling,
with_zeros,
zeros_mode,
]
# Define just the (M, K, N) shapes in a more compact list
shapes = [
# square test
(1, 16384, 16384),
# BLOOM-176B
(1, 43008, 14336),
(1, 14336, 14336),
(1, 57344, 14336),
(1, 14336, 57344),
# OPT-65B
(1, 9216, 9216),
(1, 36864, 9216),
(1, 9216, 36864),
(1, 22016, 8192),
# LLAMA-70B/65B
(1, 8192, 22016),
(1, 8192, 8192),
(1, 28672, 8192),
(1, 8192, 28672),
# square test
(16384, 16384, 16384),
# BLOOM-176B
(8192, 43008, 14336),
(8192, 14336, 14336),
(8192, 57344, 14336),
(8192, 14336, 57344),
# OPT-65B
(8192, 9216, 9216),
(8192, 36864, 9216),
(8192, 9216, 36864),
(8192, 22016, 8192),
# LLAMA-70B/65B
(8192, 8192, 22016),
(8192, 8192, 8192),
(8192, 28672, 8192),
(8192, 8192, 28672),
]
# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args))
for shape in shapes]
benchmark_sets = []
benchmark_sets.extend(test_shapes)
benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
config = config_class(*input_args)
matmul = operator(config, target=target, enable_tuning=True)
kernel_latency = matmul.profile_latency()
print("Time cost is: {:.3f} ms".format(kernel_latency))
profile_config = {
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
"BitBLAS_top20_latency": kernel_latency,
}
}
benchmark_results.update(profile_config)
# Define headers for the table
headers = [
"PrimFunc",
"Input Arguments",
"BitBLAS Top20 Latency",
]
# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
col_widths[1] = max(col_widths[1],
len(input_args_str) + 2,
len(headers[1]) + 2)
col_widths[2] = max(col_widths[2],
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
len(headers[2]) + 2)
# break only if you want to measure widths from a single example;
# otherwise, let it loop over all items.
# Print header
for i, header in enumerate(headers):
headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))
# Print rows
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
row = [
func_name,
input_args_str,
f"{values['BitBLAS_top20_latency']:.3f} ms",
]
row_str = "".join(
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)])
print(row_str)

View File

@ -1,340 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8,
fused_experts,
fused_topk)
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = [
"nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite",
"ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m"
]
DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
PER_ACT_TOKEN_OPTS = [False]
PER_OUT_CH_OPTS = [False]
def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def bench_run(results: list[benchmark.Measurement], model: str,
num_experts: int, topk: int, per_act_token: bool,
per_out_ch: bool, mkn: tuple[int, int, int]):
label = "Quant Matmul"
sub_label = (
"{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, "
"MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch,
mkn))
print(f"Testing: {sub_label}")
(m, k, n) = mkn
dtype = torch.half
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10
_, a_scale = ops.scaled_fp8_quant(a)
w1_q = torch.empty((num_experts, 2 * n, k),
device="cuda",
dtype=torch.float8_e4m3fn)
w2_q = torch.empty((num_experts, k, n),
device="cuda",
dtype=torch.float8_e4m3fn)
w1_scale = torch.empty((num_experts, 1, 1),
device="cuda",
dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1),
device="cuda",
dtype=torch.float32)
ab_strides1 = torch.full((num_experts, ),
k,
device="cuda",
dtype=torch.int64)
c_strides1 = torch.full((num_experts, ),
2 * n,
device="cuda",
dtype=torch.int64)
ab_strides2 = torch.full((num_experts, ),
n,
device="cuda",
dtype=torch.int64)
c_strides2 = torch.full((num_experts, ),
k,
device="cuda",
dtype=torch.int64)
for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
w1_scale: torch.Tensor, w2_scale: torch.Tensor,
a_scale: torch.Tensor, num_repeats: int):
for _ in range(num_repeats):
fused_experts(a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale)
def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor,
w1: torch.Tensor, w2: torch.Tensor,
w1_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
ab_strides2: torch.Tensor, c_strides2: torch.Tensor,
num_repeats: int):
for _ in range(num_repeats):
cutlass_moe_fp8(a,
w1,
w2,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale)
def run_cutlass_from_graph(
a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(
pipeline_parallel_size=1))):
return cutlass_moe_fp8(a,
w1_q,
w2_q,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale)
def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor,
w2: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, w1_scale: torch.Tensor,
w2_scale: torch.Tensor, a_scale: torch.Tensor):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(
pipeline_parallel_size=1))):
return fused_experts(a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale)
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.cuda.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale,
topk_weights, topk_ids, ab_strides1, c_strides1,
ab_strides2, c_strides2)
torch.cuda.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights,
topk_ids, w1_scale, w2_scale, a_scale)
torch.cuda.synchronize()
min_run_time = 5
num_warmup = 5
num_runs = 25
globals = {
# Baseline params
"w1": w1,
"w2": w2,
"score": score,
"topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params
"a_scale": a_scale,
"w1_q": w1_q,
"w2_q": w2_q,
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
# Gen params
"a": a,
"topk_weights": topk_weights,
"topk_ids": topk_ids,
"num_runs": num_runs,
# Kernels
"run_triton_moe": run_triton_moe,
"run_cutlass_moe": run_cutlass_moe,
"replay_graph": replay_graph,
}
# Warmup
run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids,
w1_scale, w2_scale, a_scale, num_warmup)
results.append(
benchmark.Timer(
stmt=
"run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="triton_moe",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
replay_graph(triton_graph, num_warmup)
results.append(
benchmark.Timer(
stmt="replay_graph(triton_graph, num_runs)",
globals=globals,
label=label,
sub_label=sub_label,
description="triton_moe_cuda_graphs",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights,
topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2,
num_warmup)
results.append(
benchmark.Timer(
stmt=
"run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="grouped_gemm_moe",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
replay_graph(cutlass_graph, num_warmup)
results.append(
benchmark.Timer(
stmt="replay_graph(cutlass_graph, num_runs)",
globals=globals,
label=label,
sub_label=sub_label,
description="grouped_gemm_moe_cuda_graphs",
).blocked_autorange(min_run_time=min_run_time))
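The graph-capture calls above follow the standard PyTorch CUDA Graphs pattern; a minimal, self-contained sketch of that pattern (independent of the MoE kernels, and assuming a CUDA device is available):

import torch

assert torch.cuda.is_available()
static_x = torch.randn(64, 64, device="cuda")
static_w = torch.randn(64, 64, device="cuda")

# Warm up on a side stream before capture, as recommended for CUDA graphs.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        static_y = static_x @ static_w
torch.cuda.current_stream().wait_stream(s)

# Capture the kernel sequence once...
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_y = static_x @ static_w

# ...then write new data into the captured input buffers and replay,
# mirroring replay_graph() above.
static_x.copy_(torch.randn(64, 64, device="cuda"))
g.replay()
torch.cuda.synchronize()
print(static_y.sum().item())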
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: list[benchmark.Measurement] = []
for model in args.models:
for tp in args.tp_sizes:
for layer in WEIGHT_SHAPES_MOE[model]:
num_experts = layer[0]
topk = layer[1]
size_k = layer[2]
size_n = layer[3] // tp
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for per_act_token in PER_ACT_TOKEN_OPTS:
for per_out_ch in PER_OUT_CH_OPTS:
for size_m in DEFAULT_BATCH_SIZES:
mkn = (size_m, size_k, size_n)
bench_run(results, model, num_experts, topk,
per_act_token, per_out_ch, mkn)
compare = benchmark.Compare(results)
compare.print()
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark Marlin across specified models/shapes/batches")
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES_MOE.keys(),
)
parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
parser.add_argument("--limit-per-act-token",
nargs="+",
type=int,
default=[])
parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])
args = parser.parse_args()
main(args)

View File

@ -17,14 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES
from vllm.triton_utils import HAS_TRITON
if HAS_TRITON:
from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
lora_shrink)
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
_LORA_B_PTR_DICT)
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())

View File

@ -30,18 +30,19 @@ class BenchmarkConfig(TypedDict):
num_stages: int
def benchmark_config(config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
use_deep_gemm: bool = False) -> float:
def benchmark_config(
config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16:
@ -114,41 +115,22 @@ def benchmark_config(config: BenchmarkConfig,
def run():
from vllm.model_executor.layers.fused_moe import override_config
with override_config(config):
if use_deep_gemm:
topk_weights, topk_ids = fused_topk(x, input_gating, topk,
False)
return fused_experts(
x,
w1,
w2,
topk_weights,
topk_ids,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
allow_deep_gemm=True,
)
else:
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
# JIT compilation & warmup
run()
@ -384,7 +366,6 @@ class BenchmarkWorker:
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
@ -415,8 +396,7 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm)
block_quant_shape=block_quant_shape)
return config, kernel_time
def tune(
@ -431,7 +411,6 @@ class BenchmarkWorker:
use_int8_w8a16: bool,
search_space: list[dict[str, int]],
block_quant_shape: list[int],
use_deep_gemm: bool,
) -> dict[str, int]:
best_config = None
best_time = float("inf")
@ -457,8 +436,7 @@ class BenchmarkWorker:
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20,
block_quant_shape=block_quant_shape,
use_deep_gemm=use_deep_gemm)
block_quant_shape=block_quant_shape)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
@ -553,8 +531,6 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Support for llama4
config = config.get_text_config()
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok
@ -574,8 +550,6 @@ def main(args: argparse.Namespace):
else:
batch_sizes = [args.batch_size]
use_deep_gemm = bool(args.use_deep_gemm)
ray.init()
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@ -598,10 +572,10 @@ def main(args: argparse.Namespace):
start = time.time()
configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
block_quant_shape, use_deep_gemm)
for batch_size in batch_sizes])
"tune",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
@ -615,7 +589,7 @@ def main(args: argparse.Namespace):
outputs = _distribute(
"benchmark",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@ -637,7 +611,6 @@ if __name__ == "__main__":
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"],
default="auto")
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")

View File

@ -7,13 +7,10 @@ from typing import Optional
import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random)
logger = init_logger(__name__)
NUM_BLOCKS = 128 * 1024
PARTITION_SIZE = 512
PARTITION_SIZE_ROCM = 256
@ -196,9 +193,6 @@ def main(
if __name__ == '__main__':
logger.warning("This script benchmarks the paged attention kernel. "
"By default this is no longer used in vLLM inference.")
parser = FlexibleArgumentParser(
description="Benchmark the paged attention kernel.")
parser.add_argument("--version",

View File

@ -75,19 +75,3 @@ WEIGHT_SHAPES = {
[7168, 8192],
],
}
WEIGHT_SHAPES_MOE = {
"nm-testing/Mixtral-8x7B-Instruct-v0.1": [
[8, 2, 4096, 28672],
[8, 2, 14336, 4096],
],
"nm-testing/deepseekv2-lite": [
[64, 6, 2048, 1408],
],
"ibm-granite/granite-3.0-1b-a400m": [
[32, 8, 1024, 1024],
],
"ibm-granite/granite-3.0-3b-a800m": [
[40, 8, 1024, 1536],
],
}

View File

@ -1,420 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse
import json
import multiprocessing as mp
import os
import time
from datetime import datetime
from typing import Any
import torch
import tqdm
import triton
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)
assert current_platform.is_cuda(
), "Only support tune w8a8 block fp8 kernel on CUDA device."
DTYPE_MAP = {
"float32": torch.float32,
"float16": torch.float16,
"half": torch.half,
"bfloat16": torch.bfloat16,
}
def w8a8_block_matmul(
A: torch.Tensor,
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
block_size: list[int],
config: dict[str, Any],
output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
"""This function performs matrix multiplication with
block-wise quantization.
It takes two input tensors `A` and `B` with scales `As` and `Bs`.
The output is returned in the specified `output_dtype`.
Args:
A: The input tensor, e.g., activation.
B: The input tensor, e.g., weight.
As: The per-token-group quantization scale for `A`.
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
output_dtype: The dtype of the returned tensor.
Returns:
torch.Tensor: The result of matmul.
"""
assert len(block_size) == 2
block_n, block_k = block_size[0], block_size[1]
assert A.shape[-1] == B.shape[-1]
assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
M = A.numel() // A.shape[-1]
assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
N, K = B.shape
assert triton.cdiv(N, block_n) == Bs.shape[0]
assert triton.cdiv(K, block_k) == Bs.shape[1]
C_shape = A.shape[:-1] + (N, )
C = A.new_empty(C_shape, dtype=output_dtype)
def grid(META):
return (triton.cdiv(M, META["BLOCK_SIZE_M"]) *
triton.cdiv(N, META["BLOCK_SIZE_N"]), )
if A.dtype == torch.float8_e4m3fn:
kernel = _w8a8_block_fp8_matmul
else:
raise RuntimeError(
"Currently, only support tune w8a8 block fp8 kernel.")
kernel[grid](
A,
B,
C,
As,
Bs,
M,
N,
K,
block_n,
block_k,
A.stride(-2),
A.stride(-1),
B.stride(1),
B.stride(0),
C.stride(-2),
C.stride(-1),
As.stride(-2),
As.stride(-1),
Bs.stride(1),
Bs.stride(0),
**config,
)
return C
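A rough usage sketch for the wrapper above, mirroring the shapes and scales constructed in tune() further down; it assumes a CUDA device with float8_e4m3fn support and uses a config of the same form as the search space below:

import torch
import triton

M, N, K = 64, 512, 7168
block_size = [128, 128]                    # [block_n, block_k]
block_n, block_k = block_size

fp8 = torch.float8_e4m3fn
A = torch.randn(M, K, device="cuda").to(fp8)   # activation
B = torch.randn(N, K, device="cuda").to(fp8)   # weight
# Per-token-group scales for A, per-block scales for B.
As = torch.rand(M, triton.cdiv(K, block_k), dtype=torch.float32, device="cuda")
Bs = torch.rand(triton.cdiv(N, block_n), triton.cdiv(K, block_k),
                dtype=torch.float32, device="cuda")

config = {
    "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128,
    "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3,
}
C = w8a8_block_matmul(A, B, As, Bs, block_size, config, torch.float16)
print(C.shape)  # torch.Size([64, 512])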
def get_configs_compute_bound():
configs = []
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128]:
for block_n in [32, 64, 128, 256]:
for num_warps in [4, 8]:
for group_size in [1, 16, 32, 64]:
configs.append({
"BLOCK_SIZE_M": block_m,
"BLOCK_SIZE_N": block_n,
"BLOCK_SIZE_K": block_k,
"GROUP_SIZE_M": group_size,
"num_warps": num_warps,
"num_stages": num_stages,
})
return configs
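The nested loops above enumerate 4 x 5 x 2 x 4 x 2 x 4 = 1,280 candidate configs; tune_on_gpu() further below then drops any config whose BLOCK_SIZE_K does not evenly divide the tuned block_k before the search starts.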
def get_weight_shapes(tp_size):
# NOTE(HandH1998): The weight shapes only work for DeepSeek-V3.
# Modify them, if you tune for another different model.
# cannot TP
total = [
(512 + 64, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
(7168, 18432),
]
# N can TP
n_tp = [
(18432 * 2, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(24576, 1536),
(12288, 7168),
(4096, 7168),
]
# K can TP
k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
weight_shapes = []
for t in total:
weight_shapes.append(t)
for n_t in n_tp:
new_t = (n_t[0] // tp_size, n_t[1])
weight_shapes.append(new_t)
for k_t in k_tp:
new_t = (k_t[0], k_t[1] // tp_size)
weight_shapes.append(new_t)
return weight_shapes
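For example, with tp_size=8 an N-shardable shape such as (24576, 1536) is benchmarked as (3072, 1536) per rank and a K-shardable shape such as (7168, 16384) becomes (7168, 2048), while the shapes in the "total" list are kept unsharded on every rank.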
def benchmark_config(A,
B,
As,
Bs,
block_size,
config,
out_dtype=torch.float16,
num_iters=10):
def run():
w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
torch.cuda.synchronize()
# JIT compilation & warmup
for _ in range(5):
run()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.cuda.synchronize()
start_event.record()
run()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000 # us
return avg
def tune(M, N, K, block_size, out_dtype, search_space, input_type):
factor_for_scale = 1e-2
if input_type == "fp8":
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
A_fp32 = (
(torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
fp8_max)
A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
B_fp32 = (
(torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 *
fp8_max)
B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
else:
raise RuntimeError(
"Currently, only support tune w8a8 block fp8 kernel.")
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
k_tiles = (K + block_k - 1) // block_k
As = torch.rand(M, k_tiles, dtype=torch.float32,
device="cuda") * factor_for_scale
Bs = (torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda") *
factor_for_scale)
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(
A,
B,
As,
Bs,
block_size,
config,
out_dtype,
num_iters=10,
)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
if kernel_time < best_time:
best_time = kernel_time
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={M}")
assert best_config is not None
return best_config
def save_configs(
N,
K,
block_n,
block_k,
configs,
save_path,
input_type="fp8",
) -> None:
os.makedirs(save_path, exist_ok=True)
device_name = current_platform.get_device_name().replace(" ", "_")
json_file_name = (
f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
f"block_shape=[{block_n},{block_k}].json")
config_file_path = os.path.join(save_path, json_file_name)
print(f"Writing best config to {config_file_path}...")
with open(config_file_path, "w") as f:
json.dump(configs, f, indent=4)
f.write("\n")
def tune_on_gpu(args_dict):
"""Run tuning on a specific GPU."""
gpu_id = args_dict["gpu_id"]
batch_sizes = args_dict["batch_sizes"]
weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"]
torch.cuda.set_device(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n
block_k = args.block_k
out_dtype = DTYPE_MAP[args.out_dtype]
save_path = args.save_path
input_type = args.input_type
search_space = get_configs_compute_bound()
search_space = [
config for config in search_space
if block_k % config["BLOCK_SIZE_K"] == 0
]
start = time.time()
for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
N, K = shape[0], shape[1]
print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
benchmark_results = [
tune(
batch_size,
N,
K,
[block_n, block_k],
out_dtype,
search_space,
input_type,
) for batch_size in tqdm(batch_sizes,
desc=f"GPU {gpu_id} - Batch sizes")
]
best_configs = {
M: config
for M, config in zip(batch_sizes, benchmark_results)
}
save_configs(N, K, block_n, block_k, best_configs, save_path,
input_type)
end = time.time()
print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
def distribute_batch_sizes(batch_sizes, num_gpus):
"""Distribute batch sizes across available GPUs."""
batches_per_gpu = []
for i in range(num_gpus):
start_idx = i * len(batch_sizes) // num_gpus
end_idx = (i + 1) * len(batch_sizes) // num_gpus
batches_per_gpu.append(batch_sizes[start_idx:end_idx])
return batches_per_gpu
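For instance, the 18 default batch sizes spread over 4 GPUs come out as chunks of 4, 5, 4 and 5 entries, since GPU i takes the slice [i * 18 // 4, (i + 1) * 18 // 4).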
def main(args):
print(args)
num_gpus = torch.cuda.device_count()
if num_gpus == 0:
raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning")
torch.cuda.init()
if args.batch_size is None:
batch_sizes = [
1,
2,
4,
8,
16,
24,
32,
48,
64,
96,
128,
256,
512,
1024,
1536,
2048,
3072,
4096,
]
else:
batch_sizes = [args.batch_size]
num_gpus = 1 # If only one batch size, use only one GPU
weight_shapes = get_weight_shapes(args.tp_size)
batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
process_args = []
for gpu_id in range(num_gpus):
process_args.append({
"gpu_id": gpu_id,
"batch_sizes": batches_per_gpu[gpu_id],
"weight_shapes":
weight_shapes, # Each GPU processes all weight shapes
"args": args,
})
ctx = mp.get_context("spawn")
with ctx.Pool(num_gpus) as pool:
pool.map(tune_on_gpu, process_args)
print("Multi-GPU tuning completed")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="""
Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
Then copy to model_executor/layers/quantization/utils/configs
""",
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--tp-size", "-tp", type=int, default=8)
parser.add_argument("--input-type",
type=str,
choices=["fp8"],
default="fp8")
parser.add_argument(
"--out-dtype",
type=str,
choices=["float32", "float16", "bfloat16", "half"],
default="float16",
)
parser.add_argument("--block-n", type=int, default=128)
parser.add_argument("--block-k", type=int, default=128)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--save-path", type=str, default="./")
args = parser.parse_args()
main(args)

benchmarks/launch_tgi_server.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
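# Launches a text-generation-inference container for benchmarking.
# Usage (illustrative): HF_TOKEN=<token> ./benchmarks/launch_tgi_server.sh <model-id> <max-batch-total-tokens>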
PORT=8000
MODEL=$1
TOKENS=$2
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
-v "$PWD/data:/data" \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
--model-id "$MODEL" \
--sharded false \
--max-input-length 1024 \
--max-total-tokens 2048 \
--max-best-of 5 \
--max-concurrent-requests 5000 \
--max-batch-total-tokens "$TOKENS"

View File

@ -33,6 +33,8 @@ endif()
if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-Xpreprocessor"
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS
@ -188,14 +190,12 @@ set(VLLM_EXT_SRC
"csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
"csrc/cpu/quant.cpp"
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -105,14 +105,8 @@ def run(command):
else:
enc = locale.getpreferredencoding()
output = raw_output.decode(enc)
if command == 'nvidia-smi topo -m':
# don't remove the leading whitespace of `nvidia-smi topo -m`
# because they are meaningful
output = output.rstrip()
else:
output = output.strip()
err = raw_err.decode(enc)
return rc, output, err.strip()
return rc, output.strip(), err.strip()
def run_and_read_all(run_lambda, command):
@ -282,20 +276,12 @@ def get_vllm_version():
if __version__ == "dev":
return "N/A (dev)"
version_str = __version_tuple__[-1]
if isinstance(version_str, str) and version_str.startswith('g'):
# it's a dev build
if '.' in version_str:
# it's a dev build containing local changes
git_sha = version_str.split('.')[0][1:]
date = version_str.split('.')[-1][1:]
return f"{__version__} (git sha: {git_sha}, date: {date})"
else:
# it's a dev build without local changes
git_sha = version_str[1:] # type: ignore
return f"{__version__} (git sha: {git_sha})"
return __version__
if len(__version_tuple__) == 4: # dev build
git_sha = __version_tuple__[-1][1:] # type: ignore
return f"{__version__} (git sha: {git_sha}"
return __version__
def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
@ -496,30 +482,16 @@ def get_pip_packages(run_lambda, patterns=None):
if patterns is None:
patterns = DEFAULT_PIP_PATTERNS
def run_with_pip():
try:
import importlib.util
pip_spec = importlib.util.find_spec('pip')
pip_available = pip_spec is not None
except ImportError:
pip_available = False
if pip_available:
cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
elif os.environ.get("UV") is not None:
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
raise RuntimeError(
"Could not collect pip list output (pip or uv module not available)"
)
out = run_and_read_all(run_lambda, cmd)
# People generally have `pip` as `pip` or `pip3`
# But here it is invoked as `python -mpip`
def run_with_pip(pip):
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
return "\n".join(line for line in out.splitlines()
if any(name in line for name in patterns))
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
out = run_with_pip()
out = run_with_pip([sys.executable, '-mpip'])
return pip_version, out
@ -545,12 +517,13 @@ def is_xnnpack_available():
else:
return "N/A"
def get_env_vars():
env_vars = ''
secret_terms = ('secret', 'token', 'api', 'access', 'password')
report_prefix = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN",
"OMP_", "MKL_", "NVIDIA")
secret_terms=('secret', 'token', 'api', 'access', 'password')
report_prefix = ("TORCH", "NCCL", "PYTORCH",
"CUDA", "CUBLAS", "CUDNN",
"OMP_", "MKL_",
"NVIDIA")
for k, v in os.environ.items():
if any(term in k.lower() for term in secret_terms):
continue
@ -561,7 +534,6 @@ def get_env_vars():
return env_vars
def get_env_info():
run_lambda = run
pip_version, pip_list_output = get_pip_packages(run_lambda)

View File

@ -1,178 +0,0 @@
#include <optional>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <algorithm>
#include "attention_dtypes.h"
#include "attention_utils.cuh"
namespace vllm {
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
template <typename scalar_t, const uint NUM_THREADS>
__global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
const uint token_head_threads = num_tokens * num_heads * threads_per_head;
if (global_idx >= token_head_threads) return;
// global_idx -> token_idx + head_idx + pack_idx
const uint token_head_idx = global_idx / threads_per_head;
const uint pack_idx = global_idx % threads_per_head;
const uint token_idx = token_head_idx / num_heads;
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint head_offset =
token_idx * num_heads * head_size + head_idx * head_size;
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
scalar_t* output_head_ptr = output + head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
const float max_lse = fmaxf(p_lse, s_lse);
p_lse = p_lse - max_lse;
s_lse = s_lse - max_lse;
const float p_se = expf(p_lse);
const float s_se = expf(s_lse);
const float out_se = p_se + s_se;
const float p_scale = p_se / out_se;
const float s_scale = s_se / out_se;
if (pack_offset < head_size) {
// Pack 128b load
pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
prefix_head_ptr)[pack_offset / pack_size];
pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(
suffix_head_ptr)[pack_offset / pack_size];
pack_128b_t o_out_pack;
#pragma unroll
for (uint i = 0; i < pack_size; ++i) {
// Always use float for FMA to keep high precision.
// half(uint16_t), bfloat16, float -> float.
const float p_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
const float s_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
// fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
// float -> half(uint16_t), bfloat16, float.
vllm::from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
}
// Pack 128b storage
reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
o_out_pack;
}
// We only need to write to output_lse once per head.
if (output_lse != nullptr && pack_idx == 0) {
float out_lse = logf(out_se) + max_lse;
output_lse[head_idx * num_tokens + token_idx] = out_lse;
}
}
} // namespace vllm
// The following macro is used to dispatch the conversion function based on
// the output data type. The FN is a macro that calls a function with
// template<typename scalar_t>.
#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \
{ \
if (scalar_dtype == at::ScalarType::Float) { \
fn(float); \
} else if (scalar_dtype == at::ScalarType::Half) { \
fn(uint16_t); \
} else if (scalar_dtype == at::ScalarType::BFloat16) { \
fn(__nv_bfloat16); \
} else { \
TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
} \
}
#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \
{ \
vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr, \
reinterpret_cast<scalar_t*>(prefix_output.data_ptr()), \
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size); \
}
/* @brief Merges the attention states from prefix and suffix
* into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
*
* @param output [n,h,d] The output tensor to store the merged attention states.
* @param output_lse [h,n] Optional tensor to store the log-sum-exp values.
* @param prefix_output [n,h,d] The prefix attention states.
* @param prefix_lse [h,n] The log-sum-exp values for the prefix attention
* states.
* @param suffix_output [n,h,d] The suffix attention states.
* @param suffix_lse [h,n] The log-sum-exp values for the suffix attention
* states.
*/
template <typename scalar_t>
void merge_attn_states_launcher(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
constexpr uint NUM_THREADS = 128;
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();
}
// Process one pack of elements per thread. For float, the
// pack_size is 4; for half/bf16, the pack_size is 8.
const uint threads_per_head = head_size / pack_size;
const uint total_threads = num_tokens * num_heads * threads_per_head;
dim3 block(NUM_THREADS);
dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
auto stream = at::cuda::getCurrentCUDAStream();
LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
}
#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
{ \
merge_attn_states_launcher<scalar_t>(output, output_lse, prefix_output, \
prefix_lse, suffix_output, \
suffix_lse); \
}
void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
}
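For reference, the merge performed by the kernel above is the standard log-sum-exp combination from section 2.2 of the cited paper. With prefix/suffix partial outputs O_p, O_s, their log-sum-exps l_p, l_s, and m = max(l_p, l_s), a sketch of the algebra is:

O = \frac{e^{l_p - m}}{e^{l_p - m} + e^{l_s - m}} O_p + \frac{e^{l_s - m}}{e^{l_p - m} + e^{l_s - m}} O_s,
\qquad
l_\text{out} = m + \log\left(e^{l_p - m} + e^{l_s - m}\right)

which corresponds to p_scale, s_scale, and out_lse computed per (token, head) in merge_attn_states_kernel.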

View File

@ -1,38 +0,0 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale);
#endif
void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale);
#endif
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
}

View File

@ -1,225 +0,0 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cute/tensor.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/kernel_hardware_info.h"
#include "cutlass_extensions/common.hpp"
#include "device/sm100_mla.hpp"
#include "kernel/sm100_mla_tile_scheduler.hpp"
using namespace cute;
using namespace cutlass::fmha::kernel;
template <typename T, bool PersistenceOption = true>
struct MlaSm100 {
using Element = T;
using ElementAcc = float;
using ElementOut = T;
using TileShape = Shape<_128, _128, Shape<_512, _64>>;
using TileShapeH = cute::tuple_element_t<0, TileShape>;
using TileShapeD = cute::tuple_element_t<2, TileShape>;
// H K (D_latent D_rope) B
using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
using StrideQ = cute::tuple<int64_t, _1, int64_t>; // H D B
using StrideK = cute::tuple<int64_t, _1, int64_t>; // K D B
using StrideO = StrideK; // H D B
using StrideLSE = cute::tuple<_1, int>; // H B
using TileScheduler =
std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
Sm100MlaIndividualTileScheduler>;
using FmhaKernel =
cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
/*kIsCpAsync=*/true>;
using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
};
template <typename T>
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
at::Tensor const& page_table, double scale) {
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = q_nope.device().index();
hw_info.sm_count =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
int batches = q_nope.sizes()[0];
int page_count_per_seq = page_table.sizes()[1];
int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
int page_size = kv_c_and_k_pe_cache.sizes()[1];
int max_seq_len = page_size * page_count_per_seq;
using TileShapeH = typename T::TileShapeH;
using TileShapeD = typename T::TileShapeD;
auto problem_shape =
cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
auto [H, K, D, B] = problem_shape;
auto [D_latent, D_rope] = D;
using StrideQ = typename T::StrideQ;
using StrideK = typename T::StrideK;
using StrideO = typename T::StrideO;
using StrideLSE = typename T::StrideLSE;
StrideQ stride_Q_latent = cute::make_tuple(
static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
static_cast<int64_t>(H * D_rope));
StrideK stride_C =
cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
static_cast<int64_t>(page_size * (D_latent + D_rope)));
StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
static_cast<int64_t>(H * D_latent));
using Element = typename T::Element;
using ElementOut = typename T::ElementOut;
using ElementAcc = typename T::ElementAcc;
auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
auto scale_f = static_cast<float>(scale);
typename T::Fmha::Arguments arguments{
problem_shape,
{scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
stride_C, C_ptr + D_latent, stride_C,
static_cast<int*>(seq_lens.data_ptr()),
static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
page_size},
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info,
-1, // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
// split_kv automatically based on batch size and sequence length to balance
// workload across available SMs. Consider using var_split_kv for manual
// control if needed.
T::Fmha::set_split_kv(arguments);
return arguments;
}
template <typename Element>
void runMla(at::Tensor const& out, at::Tensor const& q_nope,
at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
at::Tensor const& seq_lens, at::Tensor const& page_table,
float scale, cudaStream_t stream) {
using MlaSm100Type = MlaSm100<Element>;
typename MlaSm100Type::Fmha fmha;
auto arguments = args_from_options<MlaSm100Type>(
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
auto workspace = torch::empty(workspace_size, workspace_options);
CUTLASS_CHECK(fmha.can_implement(arguments));
CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
}
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
"kv_c_and_k_pe_cache must be a 3D tensor");
TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
auto B_q_nope = q_nope.size(0);
auto H_q_nope = q_nope.size(1);
auto D_q_nope = q_nope.size(2);
auto B_q_pe = q_pe.size(0);
auto H_q_pe = q_pe.size(1);
auto D_q_pe = q_pe.size(2);
auto B_pt = page_table.size(0);
auto PAGE_NUM = page_table.size(1);
auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
auto D_ckv = kv_c_and_k_pe_cache.size(2);
auto B_o = out.size(0);
auto H_o = out.size(1);
auto D_o = out.size(2);
TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
"H_q_nope, H_q_pe, and H_o must be equal to 128");
TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
"PAGE_SIZE must be a power of 2");
TORCH_CHECK(
B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
"Batch dims must be same for page_table, q_nope and q_pe, and out");
TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
"PAGE_NUM must be divisible by 128 / PAGE_SIZE");
TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
q_nope.dtype() == at::ScalarType::BFloat16 ||
q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
"q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
q_nope.dtype() == q_pe.dtype(),
"kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
"seq_lens must be a 32-bit integer tensor");
TORCH_CHECK(page_table.dtype() == torch::kInt32,
"page_table must be a 32-bit integer tensor");
auto in_dtype = q_nope.dtype();
at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(q_nope.get_device());
if (in_dtype == at::ScalarType::Half) {
runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
page_table, scale, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
}
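For reference, a minimal sketch of tensors that satisfy the shape/dtype checks in the removed cutlass_mla_decode_sm100a above; the batch size, pages per sequence, and sequence length are illustrative assumptions, and the actual kernel call is omitted because the Python binding is not shown in this diff:

import torch

B, H, D_latent, D_rope = 4, 128, 512, 64   # H=128 and the head dims are fixed by the TORCH_CHECKs
page_size, pages_per_seq = 128, 8          # page_size must be a power of 2

q_nope = torch.zeros(B, H, D_latent, dtype=torch.bfloat16, device="cuda")
q_pe = torch.zeros(B, H, D_rope, dtype=torch.bfloat16, device="cuda")
kv_c_and_k_pe_cache = torch.zeros(B * pages_per_seq, page_size, D_latent + D_rope,
                                  dtype=torch.bfloat16, device="cuda")
seq_lens = torch.full((B,), 1000, dtype=torch.int32, device="cuda")  # int32 per the checks
page_table = torch.zeros(B, pages_per_seq, dtype=torch.int32, device="cuda")
out = torch.empty(B, H, D_latent, dtype=torch.bfloat16, device="cuda")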

View File

@ -270,10 +270,9 @@ __global__ void reshape_and_cache_flash_kernel(
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
// head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride,
const int64_t value_stride, const int num_heads, const int head_size,
const int block_size, const float* k_scale, const float* v_scale) {
const int block_stride, const int key_stride, const int value_stride,
const int num_heads, const int head_size, const int block_size,
const float* k_scale, const float* v_scale) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
@ -289,8 +288,8 @@ __global__ void reshape_and_cache_flash_kernel(
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_key_value_idx = block_idx * block_stride +
block_offset * page_stride +
head_idx * head_stride + head_offset;
block_offset * num_heads * head_size +
head_idx * head_size + head_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
@ -397,16 +396,16 @@ void reshape_and_cache(
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(key.data_ptr()), \
reinterpret_cast<KV_T*>(value.data_ptr()), \
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
head_stride, key_stride, value_stride, num_heads, head_size, \
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(key.data_ptr()), \
reinterpret_cast<KV_T*>(value.data_ptr()), \
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, key_stride, \
value_stride, num_heads, head_size, block_size, \
reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()));
void reshape_and_cache_flash(
@ -433,11 +432,9 @@ void reshape_and_cache_flash(
int head_size = key.size(2);
int block_size = key_cache.size(1);
int64_t key_stride = key.stride(0);
int64_t value_stride = value.stride(0);
int64_t block_stride = key_cache.stride(0);
int64_t page_stride = key_cache.stride(1);
int64_t head_stride = key_cache.stride(2);
int key_stride = key.stride(0);
int value_stride = value.stride(0);
int block_stride = key_cache.stride(0);
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
dim3 grid(num_tokens);

View File

@ -88,48 +88,6 @@ void reshape_and_cache_cpu_impl(
}
}; // namespace
template <typename scalar_t>
void concat_and_cache_mla_cpu_impl(
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int num_tokens, //
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size //
) {
#pragma omp parallel for
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
continue;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
auto copy = [&](const scalar_t* __restrict__ src,
scalar_t* __restrict__ dst, int src_stride, int dst_stride,
int size, int offset) {
for (int i = 0; i < size; i++) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
dst[dst_idx] = src[src_idx];
}
};
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
}
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
@ -176,38 +134,6 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
});
}
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// pe_dim)]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& scale) {
int num_tokens = slot_mapping.size(0);
int kv_lora_rank = kv_c.size(1);
int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1);
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
TORCH_CHECK(kv_cache_dtype != "fp8");
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
VLLM_DISPATCH_FLOATING_TYPES(
kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
concat_and_cache_mla_cpu_impl<scalar_t>(
kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
kv_lora_rank, pe_dim, block_size);
CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
});
}
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
const torch::Tensor& block_mapping) {
TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")

View File

@ -78,14 +78,9 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
__m256i reg;
// normal load
explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temporal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit FP16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -115,14 +110,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
__m256i reg;
// normal load
explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temporal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -140,8 +130,6 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
__m512i reg;
explicit BF16Vec32() : reg(_mm512_setzero_si512()) {}
explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
explicit BF16Vec32(__m512i data) : reg(data) {}
@ -323,13 +311,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temporal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
explicit FP32Vec16(__m512 data) : reg(data) {}
explicit FP32Vec16(const FP32Vec4& data)
@ -562,33 +545,6 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
_mm_mask_storeu_epi8(ptr, mask, reg);
}
};
struct INT8Vec64 : public Vec<INT8Vec64> {
constexpr static int VEC_ELEM_NUM = 64;
union AliasReg {
__m512i reg;
int8_t values[VEC_ELEM_NUM];
};
__m512i reg;
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temporal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
void save(int8_t* ptr, const int elem_num) const {
constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
__mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
_mm512_mask_storeu_epi8(ptr, mask, reg);
}
// non-temporal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif
template <typename T>
@ -699,22 +655,6 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }
#ifdef __AVX512F__
inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
_mm512_stream_si512((__m512i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
_mm512_stream_ps((float*)ptr, vec.reg);
}
#endif
inline void mem_barrier() { _mm_mfence(); }
}; // namespace vec_op
#endif

View File

@ -1,393 +0,0 @@
#include "cpu_types.hpp"
#include <float.h>
namespace {
template <typename scalar_t>
struct KernelVecType {
using qk_load_vec_type = void;
using qk_vec_type = void;
using v_load_vec_type = void;
};
template <>
struct KernelVecType<float> {
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::Half> {
#if defined(__powerpc64__) || defined(__s390x__)
// Power and s390x architecture-specific vector types
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
#else
// Fallback for other architectures, including x86
using qk_load_vec_type = vec_op::FP16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP16Vec16;
#endif
};
#ifdef __AVX512BF16__
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec32;
using qk_vec_type = vec_op::BF16Vec32;
using v_load_vec_type = vec_op::BF16Vec16;
};
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// pass
#else
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
template <int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE, int HEAD_UNROLL,
typename qk_vec_type>
void mla_decode_block_head(
const qk_vec_type* __restrict__ q_vecs, // [HEAD_UNROLL, head_dim]
const qk_vec_type* __restrict__ k_vecs, // [block_size, head_dim]
const vec_op::FP32Vec16* __restrict v_vecs_f32, // [block_size, v_head_dim]
float* __restrict__ acc_out, // [HEAD_UNROLL, v_head_dim]
float* __restrict__ acc_lse, // [HEAD_UNROLL]
const float scale, const int num_tokens) {
using f32_vec_type = vec_op::FP32Vec16;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM;
float logits[BLOCK_SIZE][HEAD_UNROLL] = {}; // initialize to zeros
float max_val[HEAD_UNROLL];
std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX);
f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL];
for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) {
// load to registers
qk_vec_type q_vec[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
q_vec[unroll] =
qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec);
}
}
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale;
logits[block_offset][unroll] = acc;
max_val[unroll] = std::max(max_val[unroll], acc);
}
}
float sum_exp[HEAD_UNROLL] = {};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float val =
std::exp(logits[block_offset][unroll] - max_val[unroll]);
logits[block_offset][unroll] = val;
sum_exp[unroll] += val;
}
}
f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL];
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
// load to registers
f32_vec_type scale_[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
scale_[unroll] =
f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]};
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
f32_vec_type v_vec(
v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]);
}
}
// merge attention state
// section 2.2 in https://arxiv.org/pdf/2501.01005
f32_vec_type prev_scale[HEAD_UNROLL];
f32_vec_type curr_scale[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float prev_lse = acc_lse[unroll];
const float curr_lse = std::log(sum_exp[unroll]) +
max_val[unroll]; // add back max_val to get true lse
// softmax trick
const float max_lse = std::max(prev_lse, curr_lse);
const float prev_sum_exp = std::exp(prev_lse - max_lse);
const float curr_sum_exp = std::exp(curr_lse - max_lse);
const float new_sum_exp = prev_sum_exp + curr_sum_exp;
acc_lse[unroll] = std::log(new_sum_exp) + max_lse;
prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp};
curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp};
}
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll);
o_vec = o_vec * prev_scale[unroll] +
this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll];
o_vec.save(acc_out + i + V_HEAD_DIM * unroll);
}
}
q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL;
acc_out += V_HEAD_DIM * HEAD_UNROLL;
}
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE,
typename qk_vec_type>
void mla_decode_block(
const qk_vec_type* __restrict__ q_vecs, // [num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [block_size, head_dim]
float* __restrict__ acc_out, // [num_heads, v_head_dim]
float* __restrict__ acc_lse, // [num_heads]
const int num_heads, const float scale, const int num_tokens) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
static_assert(
std::is_same<qk_vec_type,
typename KernelVecType<scalar_t>::qk_vec_type>::value);
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
using f32_vec_type = vec_op::FP32Vec16;
static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM);
static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM);
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM;
const qk_vec_type* k_vecs;
const f32_vec_type* v_vecs_f32;
float* kv_cache_f32 = nullptr;
if constexpr (!std::is_same<scalar_t, float>::value) {
// convert KV cache block to FP32 to reuse it across query heads and
// attn @ V computation, since FP16/BF16->FP32 is expensive.
// TODO: move malloc outside of this fn to reuse across iterations.
const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float);
kv_cache_f32 = static_cast<float*>(std::aligned_alloc(64, nbytes));
for (int block_offset = 0; block_offset < num_tokens; ++block_offset)
for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) {
v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i);
f32_vec_type kv_vec_f32(kv_load_vec);
kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i);
}
if constexpr (std::is_same<qk_load_vec_type, qk_vec_type>::value) {
// for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion)
// NOTE: in this case, we only need to convert the V section to FP32.
// But for simplicity, we will convert the whole KV block to FP32.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
} else {
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache_f32);
}
// attn @ V always uses FP32 for V, since attn is FP32.
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache_f32);
} else {
// KV cache is already FP32, so no conversion is needed.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache);
}
// compute 2 heads at the same time to improve ILP and
// take advantage of register cache for K and V.
constexpr int HEAD_UNROLL = 2;
for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, HEAD_UNROLL>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM;
acc_out += HEAD_UNROLL * V_HEAD_DIM;
acc_lse += HEAD_UNROLL;
}
// take care of the remaining heads
for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, 1>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_DIM / QK_NUM_ELEM;
acc_out += V_HEAD_DIM;
acc_lse += 1;
}
if (kv_cache_f32 != nullptr) {
std::free(kv_cache_f32);
}
}
} // namespace
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE>
void mla_decode_kvcache_cpu_impl(
scalar_t* __restrict__ out, // [num_seqs, num_heads, v_head_dim]
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [num_blocks, block_size,
// head_dim]
const int num_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq, const int o_stride, const int q_stride,
const int kv_stride, const int num_seqs) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
using qk_vec_type = typename KernelVecType<scalar_t>::qk_vec_type;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
// shared across threads
const int max_threads = omp_get_max_threads();
const int acc_out_nbytes =
max_threads * num_heads * V_HEAD_DIM * sizeof(float);
float* acc_out = static_cast<float*>(std::aligned_alloc(64, acc_out_nbytes));
std::vector<float> acc_lse(max_threads * num_heads);
// allocate memory to pre-convert query to FP32 later
float* q_f32;
constexpr bool PRE_CONVERT_QUERY =
!std::is_same<scalar_t, float>::value &&
std::is_same<qk_vec_type, vec_op::FP32Vec16>::value;
if constexpr (PRE_CONVERT_QUERY) {
const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float);
q_f32 = static_cast<float*>(std::aligned_alloc(64, q_f32_nbytes));
}
#pragma omp parallel
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
float* __restrict__ acc_out_thread =
acc_out + thread_id * num_heads * V_HEAD_DIM;
float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads;
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
// reset accumulator
std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f);
std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX);
const int seq_len = seq_lens[seq_idx];
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE;
const qk_vec_type* q_vecs;
if constexpr (PRE_CONVERT_QUERY) {
// pre-convert query to FP32 since FP16/BF16->FP32 is slow.
#pragma omp for
for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) {
qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i);
qk_vec_type q_vec(q_load_vec);
q_vec.save(q_f32 + i);
}
q_vecs = reinterpret_cast<const qk_vec_type*>(q_f32);
} else {
q_vecs = reinterpret_cast<const qk_vec_type*>(q + seq_idx * q_stride);
}
#pragma omp for
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
const int physical_block_idx =
block_tables[seq_idx * max_num_blocks_per_seq + block_idx];
const int num_tokens =
block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size;
mla_decode_block<scalar_t, HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE>(
q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread,
acc_lse_thread, num_heads, scale, num_tokens);
}
// merge attention states across threads
// section 2.2 in https://arxiv.org/pdf/2501.01005
// each thread is responsible for 1 head
#pragma omp for
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
float* acc_lse_head = acc_lse.data() + head_idx;
float* acc_out_head = acc_out + head_idx * V_HEAD_DIM;
float max_val = -FLT_MAX;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]);
}
float sum_exp = 0.0f;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val);
acc_lse_head[thread_id_ * num_heads] = val;
sum_exp += val;
}
float inv_sum = 1.0f / sum_exp;
float out_head[V_HEAD_DIM] = {};
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum;
for (int i = 0; i < V_HEAD_DIM; ++i) {
out_head[i] +=
acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_;
}
}
for (int i = 0; i < V_HEAD_DIM; ++i) {
vec_op::storeFP32(out_head[i], out + seq_idx * o_stride +
head_idx * V_HEAD_DIM + i);
}
}
}
}
if (PRE_CONVERT_QUERY) {
std::free(q_f32);
}
std::free(acc_out);
}
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens) {
const int num_seqs = query.size(0);
const int num_heads = query.size(1);
const int head_dim = query.size(2);
const int block_size = kv_cache.size(1);
const int v_head_dim = out.size(2);
const int max_num_blocks_per_seq = block_tables.size(1);
const int o_stride = out.stride(0);
const int q_stride = query.stride(0);
const int kv_stride = kv_cache.stride(0);
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl)
if (head_dim == 576 && v_head_dim == 512 && block_size == 16)
mla_decode_kvcache_cpu_impl<scalar_t, 576, 512, 16>(
out.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), num_heads, scale,
block_tables.data_ptr<int>(), seq_lens.data_ptr<int>(),
max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs);
else
TORCH_CHECK(false, "Unsupported block size: ", block_size);
CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl)
});
}

View File

@ -1,781 +0,0 @@
#include "cpu/cpu_types.hpp"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
namespace {
#define MAX_SHM_RANK_NUM 8
#define MAX_THREAD_NUM 12
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
#define MIN_THREAD_PROCESS_SIZE (8 * 1024)
#define MAX_P2P_SEND_TENSOR_NUM 8
template <typename scalar_t>
struct KernelVecType {
using scalar_vec_t = void;
};
template <>
struct KernelVecType<float> {
using scalar_vec_t = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::BFloat16> {
using scalar_vec_t = vec_op::BF16Vec16;
};
template <>
struct KernelVecType<c10::Half> {
using scalar_vec_t = vec_op::FP16Vec16;
};
enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE };
struct ThreadSHMContext {
volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM];
int thread_id;
int thread_num;
int rank;
int group_size;
size_t _spinning_count;
int swizzled_ranks[MAX_SHM_RANK_NUM];
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
const int group_size, void* thread_shm_ptr)
: thread_id(thread_id),
thread_num(thread_num),
rank(rank),
group_size(group_size),
_spinning_count(0) {
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
TORCH_CHECK((size_t)this % 64 == 0);
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
shm_contexts[i] = nullptr;
thread_shm_ptrs[i] = nullptr;
swizzled_ranks[i] = (i + rank) % group_size;
thread_stats[i] = ThreadSHMStat::DONE;
}
set_context(rank, this, thread_shm_ptr);
}
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
TORCH_CHECK(ptr);
TORCH_CHECK(thread_shm_ptr);
TORCH_CHECK_EQ(ptr->thread_num, thread_num);
TORCH_CHECK_EQ(ptr->thread_id, thread_id);
shm_contexts[rank] = ptr;
thread_shm_ptrs[rank] = thread_shm_ptr;
}
template <typename T>
T* get_thread_shm_ptr(int rank) {
return reinterpret_cast<T*>(thread_shm_ptrs[rank]);
}
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
void wait_for_all(ThreadSHMStat prev_stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
while (thread_stats[rank] == prev_stat) {
++_spinning_count;
_mm_pause();
}
}
vec_op::mem_barrier();
}
void wait_for_one(int rank, ThreadSHMStat prev_stat) {
while (thread_stats[rank] == prev_stat) {
++_spinning_count;
_mm_pause();
}
vec_op::mem_barrier();
}
void set_thread_stat(ThreadSHMStat stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
shm_contexts[rank]->thread_stats[this->rank] = stat;
}
}
void set_thread_stat(int target_rank, ThreadSHMStat stat) {
for (int idx = 0; idx < group_size; ++idx) {
int rank = get_swizzled_rank(idx);
shm_contexts[rank]->thread_stats[target_rank] = stat;
}
}
// barrier for all ranks in the group, used for all2all ops
// DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ...
void barrier(ThreadSHMStat next_stat) {
if (next_stat == ThreadSHMStat::THREAD_READY) {
set_thread_stat(ThreadSHMStat::THREAD_READY);
wait_for_all(ThreadSHMStat::DONE);
} else if (next_stat == ThreadSHMStat::SHM_DATA_READY) {
set_thread_stat(ThreadSHMStat::SHM_DATA_READY);
wait_for_all(ThreadSHMStat::THREAD_READY);
} else if (next_stat == ThreadSHMStat::DONE) {
set_thread_stat(ThreadSHMStat::DONE);
wait_for_all(ThreadSHMStat::SHM_DATA_READY);
} else {
TORCH_CHECK(false, "Invalid next_stat to barrier.");
}
}
std::string to_string() const {
std::stringstream ss;
ss << "SHMContext:";
ss << "\nrank: " << rank;
ss << "\ngroup_size: " << group_size;
ss << "\nthread_num: " << thread_num;
ss << "\nthread_id: " << thread_id;
ss << "\nshm_ctx_stat_loop_seq: [";
for (int i = 0; i < group_size; ++i) {
ss << swizzled_ranks[i] << ", ";
}
ss << "]";
ss << "\nshm_contexts: [";
for (int i = 0; i < group_size; ++i) {
if (shm_contexts[i]) {
ss << shm_contexts[i]->rank << ", ";
}
}
ss << "]";
return ss.str();
}
};
class SHMManager {
public:
explicit SHMManager(const std::string& name, const int rank,
const int group_size)
: _rank(rank),
_group_size(group_size),
_thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)),
_shm_names({""}),
_shared_mem_ptrs({nullptr}),
_shm_ctx(nullptr) {
_shm_names[rank] = get_shm_name(name, rank);
_shared_mem_ptrs[rank] = init_shm(rank);
_shm_ctx = reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank]);
for (int i = 0; i < _thread_num; ++i) {
ThreadSHMContext* ctx = new (_shm_ctx + i)
ThreadSHMContext(i, _thread_num, _rank, _group_size,
compute_thread_shm_ptr(_shm_ctx, i));
}
}
void join(const std::string& name) {
for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) {
if (rank_idx != _rank) {
TORCH_CHECK(_shm_names[rank_idx].empty());
TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr);
_shm_names[rank_idx] = get_shm_name(name, rank_idx);
_shared_mem_ptrs[rank_idx] = init_shm(rank_idx);
ThreadSHMContext* target_ctx =
reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank_idx]);
for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) {
_shm_ctx[thread_idx].set_context(
rank_idx, target_ctx + thread_idx,
compute_thread_shm_ptr(target_ctx, thread_idx));
}
}
}
}
~SHMManager() { destroy_shm(); }
ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; }
static std::string get_shm_name(const std::string& name, int rank) {
return name + "_" + std::to_string(rank);
}
static int64_t create_singleton_instance(const std::string& name,
const int group_size,
const int rank) {
std::lock_guard<std::mutex> guard(SingletonInstancesLock);
SingletonInstances.emplace_back(
std::make_unique<SHMManager>(name, rank, group_size));
return static_cast<int64_t>(SingletonInstances.size() - 1);
}
static SHMManager* get_singleton_instance(int64_t handle) {
return SingletonInstances[handle].get();
}
protected:
static std::vector<std::unique_ptr<SHMManager>> SingletonInstances;
static std::mutex SingletonInstancesLock;
private:
static size_t round_to_alignment(size_t num) {
return ((num + 63) / 64) * 64;
}
int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) {
int8_t* thread_shm_ptr =
reinterpret_cast<int8_t*>(ctx) +
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
return thread_shm_ptr +
thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES);
}
size_t compute_shm_size() {
const size_t rounded_rank_buffer_size =
round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num;
const size_t rounded_thread_shm_ctx_size =
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
const size_t shm_size =
rounded_thread_shm_ctx_size + rounded_rank_buffer_size;
return shm_size;
}
void* init_shm(int target_rank) {
const std::string& shm_name = _shm_names[target_rank];
const int local_rank = _rank;
const size_t shm_size = compute_shm_size();
int fd = -1;
if (local_rank == target_rank) {
fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR,
S_IRUSR | S_IWUSR);
if (fd == -1)
TORCH_CHECK(false, "create shm in SHMManager failed. errno: " +
std::to_string(errno));
if (ftruncate(fd, shm_size) == -1)
TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " +
std::to_string(errno));
} else {
fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
if (fd == -1)
TORCH_CHECK(false, "open shm in SHMManager failed. errno: " +
std::to_string(errno));
}
void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, 0);
if (shm_ptr == MAP_FAILED) {
TORCH_CHECK(false,
"mmap in SHMManager failed. errno: " + std::to_string(errno));
}
if (close(fd) != 0) {
TORCH_CHECK(
false, "close in SHMManager failed. errno: " + std::to_string(errno));
}
TORCH_CHECK((size_t)shm_ptr % 64 == 0);
return shm_ptr;
}
void destroy_shm() {
std::stringstream ss;
ss << "local rank " << _rank << ": [";
for (int thread_id = 0; thread_id < _thread_num; ++thread_id) {
ss << _shm_ctx[thread_id]._spinning_count << ", ";
}
ss << "]\n";
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
if (_shared_mem_ptrs[i] != nullptr) {
munmap(_shared_mem_ptrs[i], compute_shm_size());
}
if (!_shm_names[i].empty()) {
shm_unlink(_shm_names[i].c_str());
}
}
}
int _rank;
int _group_size;
int _thread_num;
std::array<std::string, MAX_SHM_RANK_NUM> _shm_names;
std::array<void*, MAX_SHM_RANK_NUM> _shared_mem_ptrs;
ThreadSHMContext* _shm_ctx;
};
namespace shm_cc_ops {
template <typename scalar_t, typename F>
void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
int thread_num = ctx->thread_num;
int64_t total_bytes = elem_num * sizeof(scalar_t);
int64_t total_units_num =
(total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE;
int64_t per_thread_units_num =
(total_units_num + thread_num - 1) / thread_num;
int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t);
int64_t max_per_thread_iteration_elem_num =
PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t);
int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num;
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < thread_num; ++i) {
int64_t offset = i * per_thread_elem_num;
int64_t end = std::min(elem_num, offset + per_thread_elem_num);
int64_t curr_elem_num =
std::min(max_per_thread_iteration_elem_num, end - offset);
ThreadSHMContext* thread_ctx = ctx + i;
while (curr_elem_num > 0) {
inner_func(thread_ctx, offset, curr_elem_num);
offset += max_per_thread_iteration_elem_num;
curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset);
}
}
}
}; // namespace shm_cc_ops
namespace shm_cc_ops {
void memcpy_from_shm(void* dst, void* src, const int64_t bytes) {
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_bytes; i += 64) {
vec_op::INT8Vec64 data(
true, (int8_t*)src + i); // stream loading shm to avoid caching
data.save((int8_t*)dst + i);
}
if (aligned_bytes < bytes) {
vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes);
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
}
}
void memcpy_to_shm(void* dst, void* src, const int64_t bytes) {
#pragma GCC unroll 4
for (int64_t i = 0; i < bytes; i += 64) {
vec_op::INT8Vec64 data((int8_t*)src + i);
data.nt_save((int8_t*)dst + i);
}
}
void memcpy(void* dst, void* src, const int64_t bytes) {
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_bytes; i += 64) {
vec_op::INT8Vec64 data((int8_t*)src + i);
data.save((int8_t*)dst + i);
}
if (aligned_bytes < bytes) {
vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes);
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
}
}
template <typename scalar_t, int RANKS>
void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data,
size_t elem_num) {
CPU_KERNEL_GUARD_IN(all_reduce_sum_impl)
using vec_t = typename KernelVecType<scalar_t>::scalar_vec_t;
constexpr int64_t vec_elem_num = vec_t::get_elem_num();
const int worldsize = ctx->group_size;
shm_cc_ops::shm_cc_loop<scalar_t>(
ctx, elem_num,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
scalar_t* thread_shm_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
scalar_t* thread_data_ptr = data + data_offset;
int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t);
scalar_t* remote_data_ptrs[RANKS - 1];
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr<scalar_t>(
thread_ctx->get_swizzled_rank(idx + 1));
});
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr,
thread_data_elem_num);
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
int64_t aligned_data_elem_num =
(data_elem_num / vec_elem_num) * vec_elem_num;
int64_t i = 0;
#pragma GCC unroll 4
for (; i < aligned_data_elem_num; i += vec_elem_num) {
vec_t local_data(thread_data_ptr + i); // load from cache
vec_op::FP32Vec16 local_data_fp32(local_data);
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
vec_t remote_data(
true, remote_data_ptrs[idx] + i); // stream load from shm
vec_op::FP32Vec16 remote_data_fp32(remote_data);
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
});
vec_t reduced_data(local_data_fp32);
reduced_data.save(thread_data_ptr + i);
}
if (i < data_elem_num) {
vec_t local_data(thread_data_ptr + i); // load from cache
vec_op::FP32Vec16 local_data_fp32(local_data);
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
vec_t remote_data(
true, remote_data_ptrs[idx] + i); // stream load from shm
vec_op::FP32Vec16 remote_data_fp32(remote_data);
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
});
vec_t reduced_data(local_data_fp32);
reduced_data.save(thread_data_ptr + i,
data_elem_num - aligned_data_elem_num);
}
thread_ctx->barrier(ThreadSHMStat::DONE);
});
return;
}
}; // namespace shm_cc_ops
std::vector<std::unique_ptr<SHMManager>> SHMManager::SingletonInstances = {};
std::mutex SHMManager::SingletonInstancesLock = {};
template <typename scalar_t>
void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) {
switch (ctx->group_size) {
case 2:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 2>(ctx, data, elem_num);
break;
case 3:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 3>(ctx, data, elem_num);
break;
case 4:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 4>(ctx, data, elem_num);
break;
case 8:
shm_cc_ops::all_reduce_sum_impl<scalar_t, 8>(ctx, data, elem_num);
break;
default:
TORCH_CHECK(false,
"Invalid world size: " + std::to_string(ctx->group_size));
}
}
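// Illustration only (hypothetical helper, not part of the original file): a
// scalar reference for what all_reduce_sum_impl computes per chunk. Each rank
// accumulates, in float32, its own elements plus the same elements published
// to shared memory by the other RANKS - 1 peers, then stores the sum back in
// the original dtype.
template <typename scalar_t, int RANKS>
void all_reduce_sum_reference(scalar_t* local,
                              scalar_t* const remote[RANKS - 1],
                              size_t elem_num) {
  for (size_t i = 0; i < elem_num; ++i) {
    float acc = static_cast<float>(local[i]);
    for (int r = 0; r < RANKS - 1; ++r) {
      acc += static_cast<float>(remote[r][i]);
    }
    local[i] = static_cast<scalar_t>(acc);
  }
}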
template <typename scalar_t>
void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num,
scalar_t** outputs, const int dst) {
CPU_KERNEL_GUARD_IN(shm_gather_impl)
const int worldsize = ctx->group_size;
TORCH_CHECK_LT(dst, worldsize);
shm_cc_ops::shm_cc_loop<scalar_t>(
ctx, elem_num,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
scalar_t* thread_shm_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset,
data_elem_num * sizeof(scalar_t));
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
if (rank == dst) {
shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset,
data_elem_num * sizeof(scalar_t));
for (int i = 1; i < worldsize; ++i) {
int src_rank = thread_ctx->get_swizzled_rank(i);
scalar_t* src_ptr =
thread_ctx->get_thread_shm_ptr<scalar_t>(src_rank); // shm
scalar_t* dst_ptr = outputs[src_rank] + data_offset;
shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr,
data_elem_num * sizeof(scalar_t));
}
}
thread_ctx->barrier(ThreadSHMStat::DONE);
});
return;
}
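// Illustration only (not part of the original file): the post-condition of
// shm_gather_impl. Every rank publishes its `data` chunk by chunk into its
// shared-memory slot; only rank `dst` assembles the result, so after the call
// outputs[r][0:elem_num] on rank dst holds rank r's input for every r in the
// group, and `outputs` may be null on the other ranks.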
struct MemPiece {
void* ptr;
int64_t size;
template <typename T>
T* data_ptr() {
return reinterpret_cast<T*>(ptr);
}
};
struct TensorListMeta {
int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM];
torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM];
int64_t tensor_num;
int64_t total_bytes;
TensorListMeta() : tensor_num(0), total_bytes(0) {
static_assert(sizeof(TensorListMeta) % 64 == 0);
static_assert(sizeof(TensorListMeta) <
MIN_THREAD_PROCESS_SIZE);  // To ensure the metadata is always
                           // handled by thread 0
for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) {
tensor_bytes[i] = 0;
tensor_ptrs[i] = nullptr;
tensor_types[i] = torch::ScalarType::Undefined;
}
}
// For send and recv
void bind_tensor_list(std::vector<torch::Tensor>& tensor_list) {
TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined,
"Re-bind TensorListMeta is not allowed.")
TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM);
tensor_num = tensor_list.size();
int64_t bytes_sum = 0;
for (int i = 0; i < tensor_list.size(); ++i) {
torch::Tensor& t = tensor_list[i];
TORCH_CHECK(t.is_contiguous());
tensor_bytes[i] = t.nbytes();
tensor_types[i] = t.scalar_type();
tensor_ptrs[i] = t.data_ptr();
bytes_sum += t.nbytes();
}
total_bytes = bytes_sum;
}
// For recv
std::vector<torch::Tensor> generate_tensor_list() {
std::vector<torch::Tensor> tensor_list;
tensor_list.reserve(tensor_num);
for (int i = 0; i < tensor_num; ++i) {
int64_t bytes = tensor_bytes[i];
auto type = tensor_types[i];
int64_t elem_bytes = torch::elementSize(type);
TORCH_CHECK_EQ(bytes % elem_bytes, 0);
int64_t elem_num = bytes / elem_bytes;
auto options = torch::TensorOptions().dtype(type).device(torch::kCPU);
tensor_list.emplace_back(torch::empty({elem_num}, options));
}
return tensor_list;
}
MemPiece get_data(int64_t offset) {
for (int i = 0; i < tensor_num; ++i) {
if (offset < tensor_bytes[i]) {
return {reinterpret_cast<int8_t*>(tensor_ptrs[i]) + offset,
tensor_bytes[i] - offset};
}
offset -= tensor_bytes[i];
}
return {nullptr, 0};
}
private:
void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM];
int8_t _padding[40];
};
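// Illustration only (hypothetical function, not part of the original file):
// how get_data() resolves a flat byte offset across the bound tensors, which
// lets a thread's byte range straddle tensor boundaries during send/recv.
inline void tensor_list_meta_offset_example() {
  std::vector<torch::Tensor> ts = {
      torch::zeros({128}, torch::dtype(torch::kInt8)),  // 128 bytes
      torch::zeros({64}, torch::dtype(torch::kInt8))};  // 64 bytes
  TensorListMeta meta;
  meta.bind_tensor_list(ts);
  MemPiece a = meta.get_data(0);    // {ts[0].data_ptr(), 128}
  MemPiece b = meta.get_data(128);  // {ts[1].data_ptr(), 64}
  MemPiece c = meta.get_data(160);  // {ts[1].data_ptr() + 32, 32}
  MemPiece d = meta.get_data(192);  // {nullptr, 0}, past total_bytes
  (void)a;
  (void)b;
  (void)c;
  (void)d;
}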
void shm_send_tensor_list_impl(ThreadSHMContext* ctx,
const std::vector<torch::Tensor>& tensor_list) {
CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl)
std::vector<torch::Tensor> tensor_list_with_metadata;
tensor_list_with_metadata.reserve(1 + tensor_list.size());
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
tensor_list_with_metadata.emplace_back(
torch::empty({sizeof(TensorListMeta)}, options));
tensor_list_with_metadata.insert(tensor_list_with_metadata.end(),
tensor_list.begin(), tensor_list.end());
torch::Tensor& metadata_tensor = tensor_list_with_metadata[0];
TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta));
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
metadata->bind_tensor_list(tensor_list_with_metadata);
shm_cc_ops::shm_cc_loop<int8_t>(
ctx, metadata->total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
int rank = thread_ctx->rank;
// Wait until the receiver sets the stat to DONE
thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY);
int64_t curr_shm_offset = 0;
while (curr_shm_offset < data_elem_num) {
MemPiece frag = metadata->get_data(data_offset + curr_shm_offset);
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
shm_cc_ops::memcpy(
thread_ctx->get_thread_shm_ptr<int8_t>(rank) + curr_shm_offset,
frag.ptr, frag.size);
curr_shm_offset += frag.size;
}
thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY);
});
}
std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
int64_t src) {
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl)
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
torch::Tensor metadata_tensor =
torch::empty({sizeof(TensorListMeta)}, options);
// Wait until the sender sets thread 0's stat to SHM_DATA_READY
ctx->wait_for_one(src, ThreadSHMStat::DONE);
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
ctx->get_thread_shm_ptr<void>(src),
sizeof(TensorListMeta));
TensorListMeta* src_metadata =
reinterpret_cast<TensorListMeta*>(metadata_tensor.data_ptr());
std::vector<torch::Tensor> tensor_list_with_metadata =
src_metadata->generate_tensor_list();
TensorListMeta metadata;
metadata.bind_tensor_list(tensor_list_with_metadata);
TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num);
TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes);
shm_cc_ops::shm_cc_loop<int8_t>(
ctx, metadata.total_bytes,
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
int64_t data_elem_num) {
// Wait until the sender sets the stat to SHM_DATA_READY
thread_ctx->wait_for_one(src, ThreadSHMStat::DONE);
int64_t curr_shm_offset = 0;
while (curr_shm_offset < data_elem_num) {
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
shm_cc_ops::memcpy(
frag.ptr,
thread_ctx->get_thread_shm_ptr<int8_t>(src) + curr_shm_offset,
frag.size);
curr_shm_offset += frag.size;
}
thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE);
});
std::vector<torch::Tensor> tensor_list;
tensor_list.reserve(metadata.tensor_num - 1);
tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1,
tensor_list_with_metadata.end());
return tensor_list;
}
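// Illustration only (not part of the original file): the intended pairing of
// the two helpers above for a point-to-point exchange.
//   sender (rank s):   shm_send_tensor_list_impl(ctx, {t0, t1});
//   receiver:          auto tensors = shm_recv_tensor_list_impl(ctx, /*src=*/s);
// The sender first publishes a TensorListMeta header through thread 0's slot
// (it fits below MIN_THREAD_PROCESS_SIZE), then both sides stream the payload
// chunk by chunk, toggling the per-thread stat between SHM_DATA_READY and
// DONE to hand the buffer back and forth.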
} // namespace
void shm_gather(int64_t handle, torch::Tensor& data,
const std::optional<std::vector<torch::Tensor>>& outputs,
int64_t dst) {
TORCH_CHECK(data.is_contiguous())
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] {
CPU_KERNEL_GUARD_IN(shm_gather_impl)
if (outputs.has_value()) {
TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM);
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
for (int i = 0; i < outputs->size(); ++i) {
output_ptrs[i] = outputs->at(i).data_ptr<scalar_t>();
}
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
dst);
} else {
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel(), (scalar_t**)(0),
dst);
}
CPU_KERNEL_GUARD_OUT(shm_gather_impl)
});
}
void shm_all_gather(int64_t handle, const torch::Tensor& data,
torch::Tensor& output) {
TORCH_CHECK(data.is_contiguous())
TORCH_CHECK(output.is_contiguous())
const int64_t input_elem_num = data.numel();
const int64_t output_elem_num = output.numel();
TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0);
const int world_size = output_elem_num / input_elem_num;
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] {
CPU_KERNEL_GUARD_IN(shm_all_gather_impl)
auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
TORCH_CHECK_EQ(ctx->group_size, world_size);
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
for (int i = 0; i < world_size; ++i) {
output_ptrs[i] = output.data_ptr<scalar_t>() + i * input_elem_num;
}
shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
ctx->rank);
CPU_KERNEL_GUARD_OUT(shm_all_gather_impl)
});
}
void shm_allreduce(int64_t handle, torch::Tensor& data) {
TORCH_CHECK(data.is_contiguous())
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] {
CPU_KERNEL_GUARD_IN(shm_allreduce_sum)
shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
data.data_ptr<scalar_t>(), data.numel());
CPU_KERNEL_GUARD_OUT(shm_allreduce_sum)
});
}
void shm_send_tensor_list(int64_t handle,
const std::vector<torch::Tensor>& tensor_list,
int64_t dst) {
CPU_KERNEL_GUARD_IN(shm_send_tensor_list)
shm_send_tensor_list_impl(
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list);
CPU_KERNEL_GUARD_OUT(shm_send_tensor_list)
}
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list)
auto tensor_list = shm_recv_tensor_list_impl(
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src);
CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list)
return tensor_list;
}
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank) {
return SHMManager::create_singleton_instance(name, group_size, rank);
}
std::string join_shm_manager(int64_t handle, const std::string& name) {
auto shm_manager = SHMManager::get_singleton_instance(handle);
TORCH_CHECK(shm_manager);
shm_manager->join(name);
return shm_manager->get_shm_ctx()->to_string();
}
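// Illustration only (not part of the original file): a rough sketch of how
// the exported entry points above are sequenced on one rank. The exact
// create/join handshake (whose segment name is passed to join_shm_manager) is
// driven by the caller and not shown; names and shapes are hypothetical.
//   int64_t h = init_shm_manager("example_shm", /*group_size=*/4, /*rank=*/r);
//   join_shm_manager(h, peer_segment_name);  // attach to the peer's segment
//   torch::Tensor t = torch::ones({4096});
//   shm_allreduce(h, t);                     // in-place sum across the group
//   torch::Tensor out = torch::empty({4 * 4096});
//   shm_all_gather(h, t, out);               // out = concat of all ranks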

View File

@ -18,30 +18,6 @@ void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& bias);
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens);
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
const int64_t rank);
std::string join_shm_manager(int64_t handle, const std::string& name);
void shm_allreduce(int64_t handle, torch::Tensor& data);
void shm_gather(int64_t handle, torch::Tensor& data,
const std::optional<std::vector<torch::Tensor>>& outputs,
int64_t dst);
void shm_all_gather(int64_t handle, const torch::Tensor& data,
torch::Tensor& output);
void shm_send_tensor_list(int64_t handle,
const std::vector<torch::Tensor>& tensor_list,
int64_t dst);
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
@ -151,29 +127,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor? azp, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#endif
// SHM CCL
#ifdef __AVX512F__
ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
&init_shm_manager);
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
ops.def(
"shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> "
"()");
ops.impl("shm_gather", torch::kCPU, &shm_gather);
ops.def(
"shm_all_gather(int handle, Tensor data, Tensor! output) -> "
"()");
ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather);
ops.def(
"shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> "
"()");
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
&shm_recv_tensor_list);
#endif
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
@ -197,14 +150,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" str kv_cache_dtype,"
" Tensor k_scale, Tensor v_scale) -> ()");
cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
cache_ops.def(
"concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
" Tensor! kv_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" Tensor scale) -> ()");
cache_ops.impl("concat_and_cache_mla", torch::kCPU, &concat_and_cache_mla);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
@ -212,12 +157,4 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cpu), cpu_ops) {
cpu_ops.def(
"mla_decode_kvcache("
" Tensor! out, Tensor query, Tensor kv_cache,"
" float scale, Tensor block_tables, Tensor seq_lens) -> ()");
cpu_ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
}
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

View File

@ -4,11 +4,6 @@
#include <string>
#include <sched.h>
#endif
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
#include <unistd.h>
#include <sys/syscall.h>
#define gettid() syscall(SYS_gettid)
#endif
#include "cpu_types.hpp"
@ -23,7 +18,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
#ifndef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids;
omp_cpu_ids.reserve(omp_cpu_mask->size);

View File

@ -1,39 +0,0 @@
#include <torch/all.h>
#include <torch/cuda.h>
#include <cuda_runtime.h>
// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
// memory, and that UVA (Unified Virtual Addressing) is enabled.
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
// Get raw host pointer from CPU tensor
void* host_ptr = cpu_tensor.data_ptr();
// Get a device pointer corresponding to the pinned host memory
void* device_ptr = nullptr;
cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
TORCH_CHECK(err == cudaSuccess,
"cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
// We'll use the same sizes, strides, and dtype as the CPU tensor.
// TODO: check if layout is respected.
auto sizes = cpu_tensor.sizes();
auto strides = cpu_tensor.strides();
auto options = cpu_tensor.options().device(torch::kCUDA);
// from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
// const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
// memory, so we don't free it here.
auto deleter = [](void*) {
// no-op, since the memory is owned by the original CPU tensor
};
torch::Tensor cuda_tensor =
torch::from_blob(device_ptr, sizes, strides, deleter, options);
TORCH_CHECK(cuda_tensor.device().is_cuda(),
"Resulting tensor is not on CUDA device");
return cuda_tensor;
}
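// Illustration only (hypothetical function, not part of the original file):
// minimal usage of the helper above. It assumes UVA is available and that the
// CPU tensor was allocated in pinned (page-locked) memory, otherwise
// cudaHostGetDevicePointer() fails.
inline void example_cuda_view_of_pinned_tensor() {
  torch::Tensor cpu_t = torch::zeros(
      {1 << 20},
      torch::TensorOptions().dtype(torch::kFloat32).pinned_memory(true));
  torch::Tensor gpu_view = get_cuda_view_from_cpu_tensor(cpu_t);
  gpu_view.add_(1.0);  // runs on the GPU, writes land in the pinned buffer
  torch::cuda::synchronize();
  TORCH_CHECK(cpu_t[0].item<float>() == 1.0f);
}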

View File

@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));
fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank,
bool fully_connected) {
bool full_nvlink) {
int world_size = fake_ipc_ptrs.size();
if (world_size > 8)
throw std::invalid_argument("world size > 8 is not supported");
@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
}
return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
rank_data.numel(), rank, world_size,
fully_connected);
full_nvlink);
}
/**
@ -142,48 +142,3 @@ void register_graph_buffers(fptr_t _fa,
bytes.reserve(handles.size());
fa->register_graph_buffers(bytes, offsets);
}
std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
int64_t size) {
auto device_index = c10::cuda::current_device();
at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
void* buffer;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
auto stream = c10::cuda::getCurrentCUDAStream().stream();
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Allocate buffer
#if defined(USE_ROCM)
// data buffers need to be "uncached" for signal on MI200
AT_CUDA_CHECK(
hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
#else
AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
#endif
AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Create IPC memhandle for the allocated buffer.
// Will use it in open_mem_handle.
auto options =
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
auto handle =
torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
AT_CUDA_CHECK(
cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
}
fptr_t open_mem_handle(torch::Tensor& mem_handle) {
void* ipc_ptr;
AT_CUDA_CHECK(cudaIpcOpenMemHandle(
(void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
cudaIpcMemLazyEnablePeerAccess));
return reinterpret_cast<fptr_t>(ipc_ptr);
}
void free_shared_buffer(fptr_t buffer) {
AT_CUDA_CHECK(cudaFree(reinterpret_cast<void*>(buffer)));
}

View File

@ -5,10 +5,6 @@
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#if defined(USE_ROCM)
typedef __hip_bfloat16 nv_bfloat16;
#endif
#include <iostream>
#include <array>
#include <limits>
@ -16,7 +12,6 @@ typedef __hip_bfloat16 nv_bfloat16;
#include <unordered_map>
#include <vector>
namespace vllm {
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
@ -27,37 +22,24 @@ namespace vllm {
} \
} while (0)
// Maximal number of blocks in allreduce kernel.
namespace vllm {
constexpr int kMaxBlocks = 36;
// Default number of blocks in allreduce kernel.
#ifndef USE_ROCM
const int defaultBlockLimit = 36;
CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#else
const int defaultBlockLimit = 16;
hipPointer_attribute rangeStartAddrAttr =
HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR;
#endif
// Counter may overflow, but it's fine since unsigned int overflow is
// well-defined behavior.
using FlagType = uint32_t;
// Two sets of peer counters are needed for two syncs: starting and ending an
// operation. The reason is that it's possible for peer GPU block to arrive at
// the second sync point while the current GPU block haven't passed the first
// sync point. Thus, peer GPU may write counter+1 while current GPU is busy
// waiting for counter. We use alternating counter array to avoid this
// possibility.
struct Signal {
alignas(128) FlagType start[kMaxBlocks][8];
alignas(128) FlagType end[kMaxBlocks][8];
alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank
alignas(128) FlagType self_counter[kMaxBlocks][8];
// Two sets of peer counters are needed for two syncs. The reason is that
// it's possible for peer GPU block to arrive at the second sync point while
// the current GPU block haven't passed the first sync point. Thus, peer GPU
// may write counter+1 while current GPU is busy waiting for counter. We use
// alternating counter array to avoid this possibility.
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};
struct __align__(16) RankData {
const void* ptrs[8];
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankSignals {
@ -152,29 +134,27 @@ DINLINE O downcast(array_t<float, O::size> val) {
}
}
#if !defined(USE_ROCM)
static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#else
#else
asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#endif
#endif
}
static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
FlagType flag;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
: "=r"(flag)
: "l"(flag_addr));
#else
#else
asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
: "=r"(flag)
: "l"(flag_addr));
#endif
#endif
return flag;
}
@ -190,99 +170,37 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
return flag;
}
// This function is meant to be used as the first synchronization in the all
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
// prior memory accesses. Note: volatile writes will not be reordered against
// other volatile writes.
template <int ngpus>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
// is_start: whether this is the very first synchronization barrier.
// need_fence: whether a memory fence is needed. If true, a release-acquire
// semantic is used to enforce memory access order before and after this
// barrier.
template <int ngpus, bool is_start, bool need_fence = false>
DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
int rank) {
if constexpr (!is_start) __syncthreads();
static_assert(
!(is_start && need_fence)); // Start barrier shouldn't need fence.
if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank];
auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x];
// Write the expected counter value to peer and wait for correct value
// from peer.
st_flag_volatile(peer_counter_ptr, flag);
while (ld_flag_volatile(self_counter_ptr) != flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
// This function is meant to be used as the second or the final
// synchronization barrier in the all reduce kernel. If it's the final
// synchronization barrier, we don't need to make any visibility guarantees
// for prior memory accesses.
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank];
auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x];
// Increment the counter. Technically we only need one counter, but we use
// multiple per block to eliminate the need to share the counter via smem.
auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
// Write the expected counter value to peer and wait for correct value from
// peer.
if constexpr (!final_sync) {
st_flag_release(peer_counter_ptr, flag);
while (ld_flag_acquire(self_counter_ptr) != flag);
auto peer_counter_ptr =
&sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
auto self_counter_ptr =
&self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
if constexpr (need_fence) {
st_flag_release(peer_counter_ptr, val);
while (ld_flag_acquire(self_counter_ptr) != val);
} else {
st_flag_volatile(peer_counter_ptr, flag);
while (ld_flag_volatile(self_counter_ptr) != flag);
st_flag_volatile(peer_counter_ptr, val);
while (ld_flag_volatile(self_counter_ptr) != val);
}
}
if constexpr (!final_sync) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
if constexpr (is_start || need_fence) __syncthreads();
}
#else
template <int ngpus>
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
int rank) {
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
__ATOMIC_RELAXED,
__MEMORY_SCOPE_DEVICE) < flag);
}
__syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
template <int ngpus, bool final_sync = false>
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
__syncthreads();
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
flag,
final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
__MEMORY_SCOPE_SYSTEM);
// wait until we got true from all ranks
while (
__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
__MEMORY_SCOPE_DEVICE) < flag);
}
if constexpr (!final_sync) __syncthreads();
// use one thread to update flag
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
}
#endif
template <typename P, int ngpus, typename A>
DINLINE P packed_reduce(const P* ptrs[], int idx) {
A tmp = upcast(ptrs[0][idx]);
@ -302,13 +220,13 @@ __global__ void __launch_bounds__(512, 1)
// note: we don't reorder the address so the accumulation order is the same
// for all ranks, ensuring bitwise identical results
auto dp = *_dp;
barrier_at_start<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// do the actual reduction
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
idx += gridDim.x * blockDim.x) {
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
}
barrier_at_end<ngpus, true>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
}
template <typename P>
@ -337,20 +255,18 @@ __global__ void __launch_bounds__(512, 1)
tmps[i] = get_tmp_buf<P>(sg.signals[target]);
}
auto tmp_out = tmps[0];
barrier_at_start<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// stage 1: reduce scatter
for (int idx = start + tid; idx < end; idx += stride) {
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
}
barrier_at_end<ngpus>(sg, self_sg, rank);
multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
// stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed
// between threads that have the same tid. If thread i computes the sum of
// start + i in the first stage, then thread i also gathers start + i from
// all ranks.
// start + i in the first stage, then thread i also gathers start + i from all
// ranks.
for (int idx = tid; idx < largest_part; idx += stride) {
#pragma unroll
for (int i = 0; i < ngpus; i++) {
@ -371,22 +287,21 @@ class CustomAllreduce {
public:
int rank_;
int world_size_;
// Full NVLink or xGMI connection between GPUs.
bool fully_connected_;
bool full_nvlink_;
RankSignals sg_;
// Stores a map from a pointer to its peer pointers from all ranks.
// Stores an map from a pointer to its peer pointters from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
// For cuda graph to work, all kernel arguments must be fixed during graph
// capture time. However, the peer pointers are not known during graph
// capture time. Therefore, during capture, we increment the rank data
// pointer and use that as the argument to the kernel. The kernel arguments
// are stored in graph_unreg_buffers_. The actual peer pointers will be
// filled in at the memory pointed to by the pointers in
// graph_unreg_buffers_ when the IPC handles are exchanged between ranks.
// capture time. However, the peer pointers are not known during graph capture
// time. Therefore, during capture, we increment the rank data pointer and use
// that as the argument to the kernel. The kernel arguments are stored in
// graph_unreg_buffers_. The actual peer pointers will be filled in at the
// memory pointed to by the pointers in graph_unreg_buffers_ when
// the IPC handles are exchanged between ranks.
//
// The overall process looks like this:
// 1. Graph capture.
@ -404,18 +319,17 @@ class CustomAllreduce {
* Signals are an array of ipc-enabled buffers from all ranks.
* For each of the buffer, the layout is as follows:
* | -- sizeof(Signal) -- | ------ a few MB ----- |
* The first section is for allreduce synchronization, and the second
* section is for storing the intermediate results required by some
* allreduce algos.
* The first section is for allreduce synchronization, and the second section
* is for storing the intermediate results required by some allreduce algos.
*
* Note: this class does not own any device memory. Any required buffers
* are passed in from the constructor.
*/
CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
int rank, int world_size, bool fully_connected = true)
int rank, int world_size, bool full_nvlink = true)
: rank_(rank),
world_size_(world_size),
fully_connected_(fully_connected),
full_nvlink_(full_nvlink),
self_sg_(signals[rank]),
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
@ -447,7 +361,8 @@ class CustomAllreduce {
void* base_ptr;
// note: must share the base address of each allocation, or we get wrong
// address
if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr,
if (cuPointerGetAttribute(&base_ptr,
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
(CUdeviceptr)ptr) != CUDA_SUCCESS)
throw std::runtime_error("failed to get pointer attr");
CUDACHECK(cudaIpcGetMemHandle(
@ -481,11 +396,11 @@ class CustomAllreduce {
// Note: when registering graph buffers, we intentionally choose to not
// deduplicate the addresses. That means if the allocator reuses some
// addresses, they will be registered again. This is to account for the
// remote possibility of different allocation patterns between ranks. For
// example, rank 1 may get the same input address for the second allreduce,
// but rank 2 got a different address. IPC handles have internal reference
// counting mechanism so overhead should be small.
// addresses, they will be registered again. This is to account for the remote
// possibility of different allocation patterns between ranks. For example,
// rank 1 may get the same input address for the second allreduce, but rank 2
// got a different address. IPC handles have internal reference counting
// mechanism so overhead should be small.
void register_graph_buffers(
const std::vector<std::string>& handles,
const std::vector<std::vector<int64_t>>& offsets) {
@ -516,15 +431,15 @@ class CustomAllreduce {
/**
* Performs allreduce, assuming input has already been registered.
*
* Block and grid default configs are results after careful grid search.
* Using 36 blocks give the best or close to the best runtime on the devices
* I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also
* only take a small amount of SMs. Not quite sure the underlying reason,
* but my guess is that too many SMs will cause contention on NVLink bus.
* Block and grid default configs are results after careful grid search. Using
* 36 blocks give the best or close to the best runtime on the devices I
* tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
* take a small amount of SMs. Not quite sure the underlying reason, but my
* guess is that too many SMs will cause contention on NVLink bus.
*/
template <typename T>
void allreduce(cudaStream_t stream, T* input, T* output, int size,
int threads = 512, int block_limit = defaultBlockLimit) {
int threads = 512, int block_limit = 36) {
auto d = packed_t<T>::P::size;
if (size % d != 0)
throw std::runtime_error(
@ -558,11 +473,13 @@ class CustomAllreduce {
#define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
rank_, size);
// TODO(hanzhi713): Threshold is different for A100 and H100.
// Add per device threshold.
#define REDUCE_CASE(ngpus) \
case ngpus: { \
if (world_size_ == 2) { \
KL(ngpus, cross_device_reduce_1stage); \
} else if (fully_connected_) { \
} else if (full_nvlink_) { \
if ((world_size_ <= 4 && bytes < 512 * 1024) || \
(world_size_ <= 8 && bytes < 256 * 1024)) { \
KL(ngpus, cross_device_reduce_1stage); \
@ -580,8 +497,7 @@ class CustomAllreduce {
REDUCE_CASE(8)
default:
throw std::runtime_error(
"custom allreduce only supports num gpus in (2,4,6,8). Actual "
"num "
"custom allreduce only supports num gpus in (2,4,6,8). Actual num "
"gpus = " +
std::to_string(world_size_));
}
@ -595,11 +511,10 @@ class CustomAllreduce {
}
}
};
/**
* To inspect PTX/SASS, copy paste this header file to compiler explorer and
add a template instantiation:
* To inspect PTX/SASS, copy paste this header file to compiler explorer and add
a template instantiation:
* template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
half *, int, int, int);
*/
} // namespace vllm
} // namespace vllm

View File

@ -1,9 +1,9 @@
/**
* This is a standalone test for custom allreduce.
* To compile, make sure you have MPI and NCCL installed in your system.
* export MPI_HOME=XXX
* export MPI_HOME=xxx
* nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
* custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
* custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
*
* Warning: this C++ test is not designed to be very readable and was used
* during the rapid prototyping process.
@ -22,15 +22,7 @@
#include "cuda_profiler_api.h"
#include "custom_all_reduce.cuh"
#include "mpi.h"
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
typedef __hip_bfloat16 nv_bfloat16;
#include "rccl/rccl.h"
#include "custom_all_reduce_hip.cuh"
#else
#include "nccl.h"
#include "custom_all_reduce.cuh"
#endif
#include "nccl.h"
#define MPICHECK(cmd) \
do { \
@ -51,29 +43,16 @@ typedef __hip_bfloat16 nv_bfloat16;
} \
} while (0)
#ifdef USE_ROCM
__global__ void dummy_kernel() {
for (int i = 0; i < 100; i++) {
uint64_t start = wall_clock64();
uint64_t cycles_elapsed;
do {
cycles_elapsed = wall_clock64() - start;
} while (cycles_elapsed < 100);
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
}
#else
__global__ void dummy_kernel() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
#else
for (int i = 0; i < 100; i++) {
long long int start = clock64();
while (clock64() - start < 150000000); // approximately 98.4ms on P40
}
#endif
}
#endif
}
template <typename T>
__global__ void set_data(T* data, int size, int myRank) {
@ -142,14 +121,8 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
* registration, they are allocated and registered together in the test for
* convenience.
*/
#ifdef USE_ROCM
CUDACHECK(hipExtMallocWithFlags(
(void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal),
hipDeviceMallocUncached));
#else
CUDACHECK(
cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
#endif
CUDACHECK(
cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
@ -338,18 +311,13 @@ int main(int argc, char** argv) {
bool performance_test = true;
cudaProfilerStart();
// Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test);
// }
// }
#ifdef USE_ROCM
const int block_limit = 16;
#else
const int block_limit = 36;
#endif
// Uncomment to scan through different block size configs.
// for (int threads : {256, 512, 1024}) {
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
// performance_test);
// }
// }
// Scan through different sizes to test performance.
for (int sz = 512; sz <= (8 << 20); sz *= 2) {
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
@ -358,4 +326,4 @@ int main(int argc, char** argv) {
cudaProfilerStop();
MPICHECK(MPI_Finalize());
return EXIT_SUCCESS;
}
}

View File

@ -48,14 +48,4 @@ struct enable_sm90_or_later : Kernel {
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};
template <typename Kernel>
struct enable_sm90_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 900
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};
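// Illustration only (hypothetical functor, not part of the diff): wrapping a
// device functor in the gate above makes its body compile to a no-op below
// the targeted architecture, so multi-arch builds never instantiate
// Hopper-only code paths.
struct ExampleEpilogue {
  CUTLASS_DEVICE void operator()(float* out, float v) { *out = v; }
};
using GatedExampleEpilogue = enable_sm90_or_later<ExampleEpilogue>;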
};

View File

@ -1,457 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
*reserved. SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
*POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
//
// This file is a modified excerpt of
// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
// from https://github.com/NVIDIA/cutlass v3.5.0
// It has been modified to support either row/column or scalar broadcasting
// where the tensor being loaded from is always passed in via a device pointer.
// This lets one compiled kernel handle all cases of per-tensor or
// per-channel/per-token quantization.
//
// This interface also allows the scales to be passed in as tensors that
// consistently reside on the device, which avoids an issue with a previous
// implementation where scalars needed to be on the CPU since they
// were passed in via float values. This created a potential performance hazard
// if scales were initially on the device, and caused torch.compile graphs
// breaks when moving scales to the CPU.
//
#pragma once
// Turn off clang-format for the entire file to keep it close to upstream
// clang-format off
#include "cutlass/cutlass.h"
#include "cutlass/arch/barrier.h"
#include "cute/tensor.hpp"
#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
namespace cutlass::epilogue::fusion {
using namespace cute;
using namespace detail;
// Row vector broadcast
template<
int Stages,
class CtaTileShapeMNK,
class Element,
class StrideMNL = Stride<_0,_1,_0>,
int Alignment = 128 / sizeof_bits_v<Element>
>
struct Sm90RowOrScalarBroadcastArray {
static_assert(Stages == 0, "Row broadcast doesn't support smem usage");
static_assert(is_static_v<decltype(take<0,2>(StrideMNL{}))>); // batch stride can be dynamic or static
static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{});
struct SharedStorage {
array_aligned<Element, size<1>(CtaTileShapeMNK{})> smem;
};
// This struct has been modified to have a bool indicating that ptr_row is a
// scalar that must be broadcast, instead of containing a scalar that is
// valid if ptr_row is null.
struct Arguments {
const Element* const* ptr_row_array = nullptr;
bool row_broadcast = true;
StrideMNL dRow = {};
};
using Params = Arguments;
template <class ProblemShape>
static constexpr Params
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
return args;
}
template <class ProblemShape>
static bool
can_implement(ProblemShape const& problem_shape, Arguments const& args) {
return true;
}
template <class ProblemShape>
static size_t
get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
return 0;
}
template <class ProblemShape>
static cutlass::Status
initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
CudaHostAdapter* cuda_adapter = nullptr) {
return cutlass::Status::kSuccess;
}
CUTLASS_HOST_DEVICE
Sm90RowOrScalarBroadcastArray() { }
CUTLASS_HOST_DEVICE
Sm90RowOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage)
: params(params)
, smem(const_cast<Element*>(shared_storage.smem.data())) { }
Params params;
Element *smem = nullptr;
CUTLASS_DEVICE bool
is_producer_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_C_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_zero() const {
return (!params.row_broadcast && *(params.ptr_row_array[group]) == Element(0));
}
template <class... Args>
CUTLASS_DEVICE auto
get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
return EmptyProducerLoadCallbacks{};
}
template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class CTensor, class ThrResidue, class ThrNum>
struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
CUTLASS_DEVICE
ConsumerStoreCallbacks(
GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_,
GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_,
SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_,
CTensor tCcRow_, ThrResidue residue_tCcRow_, ThrNum thr_num_,
int group, Params const& params_)
: tGS_gRow(tGS_gRow_)
, tGS_sRow(tGS_sRow_)
, tGS_cRow(tGS_cRow_)
, tiled_G2S(tiled_g2s_)
, tSR_sRow(tSR_sRow_)
, tSR_rRow(tSR_rRow_)
, tCcRow(tCcRow_)
, residue_tCcRow(residue_tCcRow_)
, group(group)
, params(params_) {}
GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N)
GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N)
GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N)
Tiled_G2S tiled_G2S;
SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
CTensor tCcRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
ThrResidue residue_tCcRow; // (m, n)
ThrNum thr_num;
int group;
Params const& params;
CUTLASS_DEVICE void
begin() {
if (!params.row_broadcast) {
fill(tSR_rRow, *(params.ptr_row_array[group]));
return;
}
auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
Tensor tGS_gRow_flt = filter_zeros(tGS_gRow);
Tensor tGS_sRow_flt = filter_zeros(tGS_sRow);
Tensor tGS_cRow_flt = make_tensor(tGS_cRow.data(), make_layout(tGS_gRow_flt.shape(), tGS_cRow.stride()));
for (int i = 0; i < size(tGS_gRow_flt); ++i) {
if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) {
continue; // OOB of SMEM,
}
if (elem_less(tGS_cRow_flt(i), make_coord(get<0>(residue_tCcRow), get<1>(residue_tCcRow)))) {
tGS_sRow_flt(i) = tGS_gRow_flt(i);
}
else {
tGS_sRow_flt(i) = Element(0); // Set to zero when OOB so LDS can be issued without any preds.
}
}
synchronize();
}
CUTLASS_DEVICE void
begin_loop(int epi_m, int epi_n) {
if (epi_m == 0) { // Assumes M-major subtile loop
if (!params.row_broadcast) return; // Do not issue LDS when row is scalar
Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n));
Tensor tSR_rRow_flt = filter_zeros(tSR_rRow);
copy(tSR_sRow_flt, tSR_rRow_flt);
}
}
template <typename ElementAccumulator, int FragmentSize>
CUTLASS_DEVICE Array<Element, FragmentSize>
visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
Array<Element, FragmentSize> frg_row;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < FragmentSize; ++i) {
frg_row[i] = tSR_rRow(epi_v * FragmentSize + i);
}
return frg_row;
}
};
template <
bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
class... Args
>
CUTLASS_DEVICE auto
get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
auto [M, N, K, L] = args.problem_shape_mnkl;
auto [m, n, k, l] = args.tile_coord_mnkl;
using ThreadCount = decltype(size(args.tiled_copy));
Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row_array[l]), make_shape(M,N,1), params.dRow);
Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N)
Tensor sRow = make_tensor(make_smem_ptr(smem),
make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N)
//// G2S: Gmem to Smem
auto tiled_g2s = make_tiled_copy(Copy_Atom<DefaultCopy, Element>{},
Layout< Shape<_1, ThreadCount>,
Stride<_0, _1>>{},
Layout<_1>{});
auto thr_g2s = tiled_g2s.get_slice(args.thread_idx);
Tensor tGS_gRow = thr_g2s.partition_S(gRow);
Tensor tGS_sRow = thr_g2s.partition_D(sRow);
//// G2S: Coord
auto cRow = make_identity_tensor(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})));
Tensor tGS_cRow = thr_g2s.partition_S(cRow);
//// S2R: Smem to Reg
Tensor tSR_sRow = sm90_partition_for_epilogue<ReferenceSrc>(sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N)
return ConsumerStoreCallbacks<decltype(tGS_gRow), decltype(tGS_sRow), decltype(tGS_cRow), decltype(tiled_g2s), decltype(tSR_sRow), decltype(tSR_rRow), decltype(args.tCcD), decltype(args.residue_cD), ThreadCount>(
tGS_gRow,
tGS_sRow,
tGS_cRow, tiled_g2s,
tSR_sRow,
tSR_rRow,
args.tCcD,
args.residue_cD,
ThreadCount{},
l,
params);
}
};
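// Illustration only (not part of the diff): the two modes encoded by the
// Arguments above. With row_broadcast == true, ptr_row_array[l] points at a
// full row of per-channel values for group l; with row_broadcast == false it
// points at a single scalar that begin() splats into the register fragment,
// so one compiled kernel covers both per-channel and per-tensor scales.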
/////////////////////////////////////////////////////////////////////////////////////////////////
// Column vector broadcast
template<
int Stages,
class CtaTileShapeMNK,
class Element,
class StrideMNL = Stride<_1,_0,_0>,
int Alignment = 128 / sizeof_bits_v<Element>
>
struct Sm90ColOrScalarBroadcastArray {
static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet");
static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
static_assert(
(cute::is_same_v<StrideMNL, Stride<_1,_0, _0>>) || // col vector broadcast, e.g. per-row alpha/bias
(cute::is_same_v<StrideMNL, Stride<_1,_0,int>>)); // batched col vector broadcast, e.g. batched per-row bias
// Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem
struct SharedStorage { };
// This struct has been modified to have a bool indicating that ptr_col is a
// scalar that must be broadcast, instead of containing a scalar that is
// valid if ptr_col is null.
struct Arguments {
const Element* const* ptr_col_array = nullptr;
bool col_broadcast = true;
StrideMNL dCol = {};
};
using Params = Arguments;
template <class ProblemShape>
static constexpr Params
to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
return args;
}
template <class ProblemShape>
static bool
can_implement(ProblemShape const& problem_shape, Arguments const& args) {
return true;
}
template <class ProblemShape>
static size_t
get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
return 0;
}
template <class ProblemShape>
static cutlass::Status
initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
CudaHostAdapter* cuda_adapter = nullptr) {
return cutlass::Status::kSuccess;
}
CUTLASS_DEVICE bool
is_producer_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_C_load_needed() const {
return false;
}
CUTLASS_DEVICE bool
is_zero() const {
return (!params.col_broadcast && *(params.ptr_col_array[group]) == Element(0));
}
CUTLASS_HOST_DEVICE
Sm90ColOrScalarBroadcastArray() { }
CUTLASS_HOST_DEVICE
Sm90ColOrScalarBroadcastArray(Params const& params, SharedStorage const& shared_storage)
: params(params) { }
Params params;
template <class... Args>
CUTLASS_DEVICE auto
get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
return EmptyProducerLoadCallbacks{};
}
template<class GTensor, class RTensor, class CTensor, class ProblemShape>
struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
CUTLASS_DEVICE
ConsumerStoreCallbacks(
GTensor&& tCgCol,
RTensor&& tCrCol,
CTensor&& tCcCol,
ProblemShape problem_shape,
int group,
Params const& params
):
tCgCol(cute::forward<GTensor>(tCgCol)),
tCrCol(cute::forward<RTensor>(tCrCol)),
tCcCol(cute::forward<CTensor>(tCcCol)),
m(get<0>(problem_shape)),
group(group),
params(params) {}
GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
RTensor tCrCol;
CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
Params const& params;
int m;
int group;
CUTLASS_DEVICE void
begin() {
Tensor pred = make_tensor<bool>(shape(tCgCol));
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < size(pred); ++i) {
pred(i) = get<0>(tCcCol(i)) < m;
}
if (!params.col_broadcast) {
fill(tCrCol, *(params.ptr_col_array[group]));
return;
}
// Filter so we don't issue redundant copies over stride-0 modes
// (only works if 0-strides are in same location, which is by construction)
copy_if(pred, filter(tCgCol), filter(tCrCol));
}
template <typename ElementAccumulator, int FragmentSize>
CUTLASS_DEVICE Array<Element, FragmentSize>
visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
Array<Element, FragmentSize> frg_col;
Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < FragmentSize; ++i) {
frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i);
}
return frg_col;
}
};
template <
bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
class... Args
>
CUTLASS_DEVICE auto
get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
auto [M, N, K, L] = args.problem_shape_mnkl;
auto [m, n, k, l] = args.tile_coord_mnkl;
Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col_array[l]), make_shape(M,N,1), params.dCol);
Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
// Generate an identity tensor matching the shape of the global tensor and
// partition the same way, this will be used to generate the predicate
// tensor for loading
Tensor cCol = make_identity_tensor(mCol.shape());
Tensor tCcCol = sm90_partition_for_epilogue<ReferenceSrc>( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
return ConsumerStoreCallbacks(
cute::move(tCgCol),
cute::move(tCrCol),
cute::move(tCcCol),
args.problem_shape_mnkl,
l,
params
);
}
};
}

View File

@ -1,7 +1,6 @@
#pragma once
#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
#include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp"
/*
This file defines custom epilogues for fusing channel scales, token scales,
@ -70,16 +69,6 @@ struct ScaledEpilogueBase {
0 /*Stages*/, TileShape, T, T, Stride<Int<0>, Int<1>, Int<0>>,
128 / sizeof_bits_v<T>, EnableNullPtr>;
template <typename T>
using ColOrScalarLoadArray =
cutlass::epilogue::fusion::Sm90ColOrScalarBroadcastArray<
0 /*Stages*/, TileShape, T, Stride<Int<1>, Int<0>, Int<0>>>;
template <typename T>
using RowOrScalarLoadArray =
cutlass::epilogue::fusion::Sm90RowOrScalarBroadcastArray<
0 /*Stages*/, TileShape, T, Stride<Int<0>, Int<1>, Int<0>>>;
// This utility function constructs the arguments for the load descriptors
// from a tensor. It can handle both row and column, as well as row/column or
// scalar cases.
@ -107,14 +96,6 @@ struct ScaledEpilogueBase {
std::is_same_v<Descriptor, RowLoad<T, true>>);
return Arguments{data_ptr};
}
template <typename Descriptor, typename T>
static auto args_from_tensor(const T* const* data_ptr, bool do_broadcast) {
using Arguments = typename Descriptor::Arguments;
static_assert(std::is_same_v<Descriptor, ColOrScalarLoadArray<T>> ||
std::is_same_v<Descriptor, RowOrScalarLoadArray<T>>);
return Arguments{data_ptr, do_broadcast};
}
};
/*
@ -400,51 +381,4 @@ struct ScaledEpilogueBiasAzpToken
}
};
/*
This epilogue works like ScaledEpilogue, but ScaleA and ScaleB are pointers
to arrays containing different scales used in group gemm. The number of
pointers in ScaleA and the number of pointers in ScaleB are equal to the
group size.
*/
template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
struct ScaledEpilogueArray
: private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
private:
using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
using Accum = typename SUPER::Accum;
using ScaleA = typename SUPER::template ColOrScalarLoadArray<float>;
using ScaleB = typename SUPER::template RowOrScalarLoadArray<float>;
using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiplies, float, float,
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTCompute0 =
cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiplies, ElementD, float,
cutlass::FloatRoundStyle::round_to_nearest>;
public:
using EVTCompute =
cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
using ArgumentType = typename EVTCompute::Arguments;
using ScaleAArray = typename SUPER::template ColOrScalarLoadArray<float>;
using ScaleBArray = typename SUPER::template RowOrScalarLoadArray<float>;
static ArgumentType prepare_args(float const* const* a_scales_ptr,
float const* const* b_scales_ptr,
bool a_col_broadcast, bool b_row_broadcast) {
auto a_args = SUPER::template args_from_tensor<ScaleAArray, float>(
a_scales_ptr, a_col_broadcast);
auto b_args = SUPER::template args_from_tensor<ScaleBArray, float>(
b_scales_ptr, b_row_broadcast);
typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
return ArgumentType{a_args, evt0_args, {}};
}
};
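// Illustration only (not part of the diff): how the arguments above would be
// assembled for a grouped GEMM, assuming `a_scales` / `b_scales` are device
// arrays of per-group scale pointers and `Epilogue` is an instantiation of
// ScaledEpilogueArray for the kernel's accumulator/output types.
//   auto args = Epilogue::prepare_args(a_scales, b_scales,
//                                      /*a_col_broadcast=*/true,
//                                      /*b_row_broadcast=*/true);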
}; // namespace vllm::c3x

Some files were not shown because too many files have changed in this diff.