Compare commits
235 Commits
skip-lmfe- ... codex/add-
| SHA1 | Author | Date | |
|---|---|---|---|
| 14299bfcaf | |||
| b10c64c834 | |||
| 0925b28a8e | |||
| 99722d5f0e | |||
| 4c91a28e30 | |||
| b038d9c40c | |||
| 2ba60ec7fe | |||
| bd7157a071 | |||
| be429d0cfd | |||
| c253745eb8 | |||
| daec4d2624 | |||
| 6c9fdbf725 | |||
| 483ea64611 | |||
| e20eba753b | |||
| bbc1b29665 | |||
| acb1bfa601 | |||
| 75c7ad9918 | |||
| 5550ff9c25 | |||
| 3aeb19a39e | |||
| 8c017b3490 | |||
| 9c2c2287a0 | |||
| fec2b341ad | |||
| 87bc0c492f | |||
| fe3b9372ad | |||
| bde9e2272a | |||
| 08405609cc | |||
| ab81379ea6 | |||
| 4ffd6e8942 | |||
| 965c5f4914 | |||
| 4d055ef465 | |||
| 17c540a993 | |||
| 4d4d6bad19 | |||
| 11ae016bd7 | |||
| 41d3071918 | |||
| fb5e10d3fb | |||
| b2f78cbad4 | |||
| 23583ee28c | |||
| 01c977e96d | |||
| b3dda72c23 | |||
| fb0571b077 | |||
| 2ed8b6b3d0 | |||
| 013abde6ef | |||
| a5464dcf92 | |||
| ac3ed5a815 | |||
| e6ba2000ae | |||
| aa255ff55a | |||
| 7bb736d00e | |||
| 9f4e30904b | |||
| 5afd3276df | |||
| 43721bc67f | |||
| 02d709a6f1 | |||
| 4a510ab487 | |||
| 314fa8abbf | |||
| 334535b6fb | |||
| dcbb3f1871 | |||
| 00417f4e44 | |||
| ed344f4116 | |||
| e51928793e | |||
| d2740fafbf | |||
| 17838e50ef | |||
| 44c8555621 | |||
| f7d318de2b | |||
| 76f0d05bc6 | |||
| 7d8975de84 | |||
| 785d8b6410 | |||
| f6cdc9a02f | |||
| 509cdc0370 | |||
| 9b6504c307 | |||
| e19b16dde6 | |||
| 582f2c6be7 | |||
| f8a0acbdbe | |||
| 1317034379 | |||
| 0ecc553ee6 | |||
| f96bc3649c | |||
| 938c43ea7f | |||
| 0a9ef0cfce | |||
| e5b438a247 | |||
| 0b99f5d302 | |||
| 1f491aa0c8 | |||
| de92d916fe | |||
| a1063628a4 | |||
| d796375258 | |||
| 14f8456344 | |||
| 4794c2bd92 | |||
| d3cbaa08dc | |||
| 828523ad8e | |||
| 136a17fe6e | |||
| f57438338d | |||
| 5d598680e3 | |||
| 8f4b313c37 | |||
| f93e348010 | |||
| f54f85129e | |||
| d4d1a6024f | |||
| db1764e4e0 | |||
| 7f83b4ee8e | |||
| 5c3bae1a6a | |||
| 5210dc3940 | |||
| 650b51f9f9 | |||
| 6256697997 | |||
| 71557a5f7c | |||
| f3c378ffa7 | |||
| f5ed68ef63 | |||
| efdef57b1f | |||
| b8a4572157 | |||
| 302ef403a2 | |||
| 8865da157b | |||
| f0862eae43 | |||
| 8c851f6d04 | |||
| 7cfa420f49 | |||
| a27b288e4a | |||
| e471d7ca7e | |||
| c43ca8259e | |||
| 85a65e7f51 | |||
| a2986b3e33 | |||
| 96b9aa5aa0 | |||
| e66d787bce | |||
| bfad142e25 | |||
| 9354660036 | |||
| 07ca70af8d | |||
| 2dcd12d357 | |||
| 579d2e5458 | |||
| 0512c04aee | |||
| 7e0ef4084a | |||
| 4aed506b65 | |||
| a86b4c58e8 | |||
| ff4810ba73 | |||
| 9d6964926e | |||
| 0e65818910 | |||
| 380f17527c | |||
| b92ab3deda | |||
| acaa2c0a4a | |||
| 82af928c41 | |||
| 87efc681db | |||
| c3a722fcb2 | |||
| aba48f7db1 | |||
| 04b5f9802d | |||
| efc8f7d814 | |||
| 6d87a2838c | |||
| e6cdbd6792 | |||
| df850c4912 | |||
| 720394de43 | |||
| 88a49745af | |||
| ca683a2a72 | |||
| e9f1b8c9e9 | |||
| ea97940d6c | |||
| fdd32750f0 | |||
| c715ba3735 | |||
| 9c4cb68339 | |||
| 780eb03d9b | |||
| ef9676a1f1 | |||
| 70b1b330e1 | |||
| d1d063a588 | |||
| 7e6edb1469 | |||
| 74704d4553 | |||
| d2f816d6ff | |||
| 577d498212 | |||
| fd85c9f426 | |||
| d32c611f45 | |||
| 01ad27faff | |||
| 481545b397 | |||
| d3cc8427c0 | |||
| 4821ac1b4d | |||
| 4497c8f821 | |||
| 2e36cdbe2b | |||
| fe3edb4cf0 | |||
| 29350922c6 | |||
| 8ae169286f | |||
| 8a0af6a561 | |||
| cfded80793 | |||
| b59dd19b55 | |||
| 3e051bda82 | |||
| 8317f72354 | |||
| d8bebb008a | |||
| 35bc22f23c | |||
| fa96fb9c70 | |||
| e3fdb627d9 | |||
| 7200a21cd1 | |||
| 577c72a227 | |||
| 314285d4f2 | |||
| d2a7938582 | |||
| 89342ce4c0 | |||
| f89f599395 | |||
| e251e457c5 | |||
| afc47e4de7 | |||
| e3b90c1ba2 | |||
| 134f70b3ed | |||
| a1b2d658ee | |||
| 5c7fe25491 | |||
| 53c9a7cee2 | |||
| 0d21b9b51e | |||
| 10214b6935 | |||
| 4a61950f4d | |||
| 3263799056 | |||
| 8e67b2557a | |||
| 4073c82c4e | |||
| 767c3ab869 | |||
| 4f207c7174 | |||
| 782505ed8e | |||
| 98f30b8cba | |||
| 3cd36660f7 | |||
| 46ad73955a | |||
| 41f3884438 | |||
| 60e419c1ee | |||
| 7ef6052804 | |||
| 4fca1a1bd2 | |||
| a6049be73c | |||
| 18ed7746ea | |||
| 8fcaaf6a16 | |||
| 9bb38130cb | |||
| b91d8db873 | |||
| 045b396d09 | |||
| 76852017ea | |||
| 82e64c7a20 | |||
| 4ca204055e | |||
| c5c8f5ea59 | |||
| 01653a917b | |||
| 0cd103e7cb | |||
| 5be7ca1b99 | |||
| f0a30a067b | |||
| 9d6cff3ede | |||
| a25f2adee9 | |||
| d0bed837ac | |||
| f7ee69868a | |||
| d2a71530c1 | |||
| 086609de64 | |||
| 727144bed1 | |||
| 55392bc879 | |||
| ddaff2938e | |||
| 27ed39a347 | |||
| 8f8474fbe3 | |||
| be067861c6 | |||
| 5bc26c438d | |||
| eef921f45e | |||
| e317414ce1 | |||
| 949cb0170d |
@@ -5,11 +5,11 @@ import os
import sys
import zipfile

# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def print_top_10_largest_files(zip_file):
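For orientation, a minimal sketch of how a wheel-size gate of this kind typically works is shown below; only VLLM_MAX_SIZE_MB comes from the hunk above, while the function name and error handling are assumptions rather than the project's exact code.

```python
import os
import zipfile

# Limit read from the environment, defaulting to the new 500 MiB budget.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def check_wheel_size(wheel_path: str) -> None:
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb > VLLM_MAX_SIZE_MB:
        # Listing the largest members explains what pushed the wheel over budget.
        with zipfile.ZipFile(wheel_path) as zf:
            largest = sorted(zf.infolist(), key=lambda i: i.file_size, reverse=True)[:10]
            for info in largest:
                print(f"{info.filename}: {info.file_size / (1024 * 1024):.1f} MiB")
        raise RuntimeError(f"wheel is {size_mb:.1f} MiB, limit is {VLLM_MAX_SIZE_MB} MiB")
```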
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
limit: 1000
num_fewshot: 5
@@ -0,0 +1,12 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
    value: 0.80
limit: 100
num_fewshot: 0
@@ -0,0 +1,10 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
  metrics:
  - name: "exact_match,custom-extract"
    value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
@@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
@@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1

model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    value: 0.855
limit: 2500
num_fewshot: 0
.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file, 1 line)
@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file, 1 line)
@@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file, 44 lines)
@@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9

usage() {
    echo``
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -m - huggingface stub or local directory of the model"
    echo " -l - limit number of samples to run"
    echo " -t - tensor parallel size to run at"
    echo
}

while getopts "m:l:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm-vlm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit $LIMIT
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (mode change: normal file → executable file)
@@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo``
    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -m - huggingface stub or local directory of the model"
    echo " -l - limit number of samples to run"
    echo " -f - number of fewshot samples to use"
    echo " -t - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size auto
@@ -19,21 +19,27 @@ RTOL = 0.08
def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len}"
        f"max_model_len={max_model_len},"
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model=backend,
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
        # text models. however, this is regressing measured strict-match for
        # existing text models in CI, so only apply it for mm.
        apply_chat_template=backend == "vllm-vlm",
        batch_size=batch_size,
    )
    return results
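To show how the YAML configs earlier in this comparison tie into this test, here is a hedged sketch of the comparison step implied by the RTOL = 0.08 constant in the hunk header; the helper name and the exact result layout are assumptions, not the test file's literal code.

```python
import numpy as np
import yaml

RTOL = 0.08  # tolerance from the hunk header above


def check_config(results: dict, config_path: str) -> None:
    # `results` is assumed to follow lm_eval.simple_evaluate()'s layout:
    # results["results"][task_name][metric_name] -> measured value.
    with open(config_path) as f:
        eval_config = yaml.safe_load(f)
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            expected = metric["value"]
            assert np.isclose(expected, measured, rtol=RTOL), (
                f"{task['name']}/{metric['name']}: expected {expected}, got {measured}"
            )
```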
@@ -8,7 +8,7 @@ steps:
  commands:
    # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
    # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
    - "mkdir artifacts"
    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
    - "bash .buildkite/scripts/upload-wheels.sh"
@@ -76,7 +76,7 @@ steps:
    queue: arm64_cpu_queue_postmerge
  commands:
    - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
    - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

# Add job to create multi-arch manifest
@@ -25,25 +25,28 @@ function cpu_tests() {

  # offline inference
  podman exec -it "$container_id" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
    set -xve
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
    set -e
    set -evx
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
    pip install sentence-transformers datamodel_code_generator
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model

    # Note: disable Bart until supports V1
    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}

# All of CPU tests are expected to be finished less than 40 mins.

export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
timeout 120m bash -c cpu_tests
@@ -70,7 +70,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -s -v \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

  # Note: disable it until supports V1
  # Run AWQ test
@@ -44,6 +44,5 @@ docker run \
  pytest -v -s v1/structured_output
  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
  pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
  pytest -v -s v1/test_metrics
  pytest -v -s v1/test_serial_utils.py
'

.buildkite/test-amd.yaml (new file, 1267 lines; diff suppressed because it is too large)
@@ -403,6 +403,7 @@ steps:
  - pytest -v -s compile/test_fusion_all_reduce.py
  - pytest -v -s compile/test_decorator.py
  - pytest -v -s compile/test_noop_elimination.py
  - pytest -v -s compile/test_aot_compile.py

- label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -415,8 +416,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 20min
  timeout_in_minutes: 30
- label: PyTorch Fullgraph Test # 22min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@@ -424,6 +425,7 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py

- label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -526,8 +528,9 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py

- label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@@ -732,6 +735,16 @@ steps:
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Accuracy Eval (Small Models) # 50min
  timeout_in_minutes: 70
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1

- label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@@ -795,8 +808,8 @@ steps:
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
  timeout_in_minutes: 60
- label: Blackwell Test # 21 min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@@ -809,8 +822,6 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
  - vllm/compilation/fusion_attn.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
@@ -827,15 +838,32 @@ steps:
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  # Fusion
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  - pytest -v -s tests/kernels/moe/test_flashinfer.py

- label: Blackwell Fusion Tests # 30 min
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  commands:
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@@ -1056,6 +1084,17 @@ steps:
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

- label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh

##### multi gpus test #####
@@ -1088,7 +1127,7 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

##### H200 test #####
- label: Distrubted Tests (H200) # optional
- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
@@ -1096,6 +1135,8 @@ steps:
  commands:
  - pytest -v -s tests/compile/test_async_tp.py
  - pytest -v -s tests/compile/test_sequence_parallelism.py
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
.coveragerc (17 lines changed)
@@ -1,5 +1,10 @@
[run]
source = vllm
# Track the installed vllm package (this is what actually gets imported during tests)
# Use wildcard pattern to match the installed location
source =
    vllm
    */dist-packages/vllm
    */site-packages/vllm
omit =
    */tests/*
    */test_*
@@ -12,6 +17,16 @@ omit =
    */benchmarks/*
    */docs/*

[paths]
# Map all possible vllm locations to a canonical "vllm" path
# This ensures coverage.combine properly merges data from different test runs
source =
    vllm
    /vllm-workspace/src/vllm
    /vllm-workspace/vllm
    */site-packages/vllm
    */dist-packages/vllm

[report]
exclude_lines =
    pragma: no cover
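As a brief illustration of why the new [paths] section matters, the sketch below (typical coverage.py usage, not part of the diff) combines per-run data files and reports them; without the mapping, data recorded against */site-packages/vllm and against a source checkout would be treated as different files instead of being merged.

```python
import coverage

# Merge .coverage.* data files from separate test runs and print one report.
# The [paths] section in .coveragerc folds the different install locations
# (source tree, site-packages, dist-packages) into the canonical "vllm" path.
cov = coverage.Coverage(config_file=".coveragerc")
cov.combine()
cov.save()
cov.report(show_missing=False)
```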
.git-blame-ignore-revs (new file, 4 lines)
@@ -0,0 +1,4 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f
.github/CODEOWNERS (13 lines changed)
@@ -5,9 +5,7 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
@@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -60,7 +57,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/offloading @ApostaC

# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor

# Docs
@@ -121,3 +118,11 @@ mkdocs.yaml @hmellor

# KVConnector installation files
/requirements/kv_connectors.txt @NickLucche

# Pooling models
/examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop
.github/workflows/issue_autolabel.yml (138 lines changed)
@@ -13,6 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          script: |
@@ -42,7 +43,6 @@ jobs:
                  searchIn: "body"
                },
              ],

              // Substring search - matches anywhere in text (partial matches)
              substrings: [
                {
@@ -89,14 +89,12 @@ jobs:
                  term: "hip_",
                  searchIn: "both"
                },

                // ROCm tools and libraries
                {
                  term: "hipify",
                  searchIn: "both"
                },
              ],

              // Regex patterns - for complex pattern matching
              regexPatterns: [
                {
@@ -107,13 +105,17 @@ jobs:
                }
              ],
            },
            // Add more label configurations here as needed
            // example: {
            //   keywords: [...],
            //   substrings: [...],
            //   regexPatterns: [...]
            // },
          };

          // Helper function to create regex based on search type
          function createSearchRegex(term, type) {
            // Escape special regex characters in the term
            const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

            switch (type) {
              case 'keyword':
                // Word boundary search - matches whole words only
@@ -125,16 +127,13 @@ jobs:
                throw new Error(`Unknown search type: ${type}`);
            }
          }

          // Helper function to find matching terms in text with line information
          function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
            const matches = [];
            const lines = text.split('\n');

            for (const termConfig of searchTerms) {
              let regex;
              let term, searchIn, pattern, description, flags;

              // Handle different input formats (string or object)
              if (typeof termConfig === 'string') {
                term = termConfig;
@@ -146,21 +145,17 @@ jobs:
                description = termConfig.description;
                flags = termConfig.flags;
              }

              // Skip if this term shouldn't be searched in the current location
              if (searchIn !== 'both' && searchIn !== searchLocation) {
                continue;
              }

              // Create appropriate regex
              if (searchType === 'regex') {
                regex = new RegExp(pattern, flags || "gi");
              } else {
                regex = createSearchRegex(term, searchType);
              }

              const termMatches = [];

              // Check each line for matches
              lines.forEach((line, lineIndex) => {
                const lineMatches = line.match(regex);
@@ -175,15 +170,14 @@ jobs:
                    originalTerm: term || pattern,
                    description: description,
                    // Show context around the match in the line
                    context: line.length > 100 ?
                      line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
                        line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                    context: line.length > 100 ?
                      line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
                        line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                      : line.trim()
                  });
                });
              }
            });

            if (termMatches.length > 0) {
              matches.push({
                term: term || (description || pattern),
@@ -196,64 +190,48 @@ jobs:
              });
            }
          }

          return matches;
          }

          // Helper function to check if label should be added
          async function processLabel(labelName, config) {
            const body = context.payload.issue.body || "";
            const title = context.payload.issue.title || "";

            core.notice(`Processing label: ${labelName}`);
            core.notice(`Issue Title: "${title}"`);
            core.notice(`Issue Body length: ${body.length} characters`);

            let shouldAddLabel = false;
            let allMatches = [];
            let reason = '';

            const keywords = config.keywords || [];
            const substrings = config.substrings || [];
            const regexPatterns = config.regexPatterns || [];

            core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);

            // Search in title
            if (title.trim()) {
              core.notice(`Searching in title: "${title}"`);

              const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
              const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
              const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');

              allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
            }

            // Search in body
            if (body.trim()) {
              core.notice(`Searching in body (${body.length} characters)`);

              const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
              const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
              const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');

              allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
            }

            if (allMatches.length > 0) {
              core.notice(`Found ${allMatches.length} matching term(s):`);

              for (const termMatch of allMatches) {
                const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;

                if (termMatch.searchType === 'regex') {
                  core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                } else {
                  core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                }

                // Show details for each match
                termMatch.matches.forEach((match, index) => {
                  core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +244,6 @@ jobs:
                  }
                });
              }

              shouldAddLabel = true;
              const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
              const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +251,10 @@ jobs:
              const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
              const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
              const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);

              reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
            }

            core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
            core.notice(`Reason: ${reason || 'No matching terms found'}`);

            if (shouldAddLabel) {
              const existingLabels = context.payload.issue.labels.map(l => l.name);
              if (!existingLabels.includes(labelName)) {
@@ -296,14 +270,92 @@ jobs:
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }

              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }

            // Process all configured labels
            const processLabels = Object.entries(labelConfig)
              .map(([labelName, config]) => processLabel(labelName, config));
            const labelsAdded = await Promise.all(processLabels);
            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
            const labelsAddedResults = await Promise.all(
              Object.entries(labelConfig).map(([labelName, config]) =>
                processLabel(labelName, config).then(added => ({ labelName, added }))
              )
            );

            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);

            // Return which labels were added for the next step
            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
            core.setOutput('labels_added', JSON.stringify(addedLabels));
            return addedLabels;

      - name: CC users for labeled issues
        if: steps.label-step.outputs.labels_added != '[]'
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          script: |
            // Configuration: Map labels to GitHub users to CC
            // You can add multiple users per label, and multiple label configurations
            const ccConfig = {
              rocm: {
                users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
                message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
              },
              // Add more label -> user mappings here
              // Example:
              // cuda: {
              //   users: ['user1', 'user2'],
              //   message: 'CC {users} for CUDA-related issue'
              // },
              // performance: {
              //   users: ['perfexpert'],
              //   message: 'CC {users} for performance issue'
              // },
            };

            const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
            core.notice(`Labels added: ${labelsAdded.join(', ')}`);

            // Get existing comments to check for already mentioned users
            const comments = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
            });

            const issueBody = context.payload.issue.body || '';
            const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');

            // Process each label that was added
            for (const label of labelsAdded) {
              if (ccConfig[label]) {
                const config = ccConfig[label];
                const usersToMention = [];

                // Check which users haven't been mentioned yet
                for (const user of config.users) {
                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
                  if (!mentionPattern.test(allExistingText)) {
                    usersToMention.push(user);
                  } else {
                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
                  }
                }

                // Post comment if there are users to mention
                if (usersToMention.length > 0) {
                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
                  const message = config.message.replace('{users}', mentions);

                  await github.rest.issues.createComment({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    issue_number: context.issue.number,
                    body: message
                  });

                  core.notice(`CC comment added for label "${label}": ${mentions}`);
                } else {
                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
                }
              }
            }
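The matching logic above runs as JavaScript inside actions/github-script. Purely as an illustration of the same idea (word-boundary keyword search versus plain substring search), a small Python analogue might look like this; it is not part of the workflow.

```python
import re


def find_matches(text: str, term: str, mode: str = "keyword") -> list[tuple[int, str]]:
    """Return (line_number, line) pairs where `term` occurs.

    "keyword" matches whole words only; "substring" matches anywhere.
    """
    escaped = re.escape(term)
    pattern = rf"\b{escaped}\b" if mode == "keyword" else escaped
    regex = re.compile(pattern, re.IGNORECASE)
    return [
        (i + 1, line.strip())
        for i, line in enumerate(text.splitlines())
        if regex.search(line)
    ]


print(find_matches("Build fails under ROCm.\nhipify step crashed.", "rocm"))
```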
@@ -4,7 +4,6 @@ MD013: false
MD024:
  siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false

@@ -16,6 +16,7 @@ repos:
    rev: v1.38.1
    hooks:
      - id: typos
        args: [--force-exclude]
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v21.1.2
    hooks:
@@ -8,7 +8,6 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union

import aiohttp
import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict | list[dict]] = None
    model_name: str | None = None
    logprobs: int | None = None
    extra_body: dict | None = None
    multi_modal_content: dict | list[dict] | None = None
    ignore_eos: bool = False
    language: Optional[str] = None
    request_id: Optional[str] = None
    language: str | None = None
    request_id: str | None = None


@dataclass
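Most of the remaining hunks in this comparison are the same mechanical migration from typing.Optional/Union to the PEP 604 syntax available on Python 3.10+. A minimal before/after sketch with illustrative names:

```python
from dataclasses import dataclass
from typing import Optional, Union  # old style


@dataclass
class OldStyle:
    name: Optional[str] = None       # same as Union[str, None]
    score: Union[int, float] = 0


@dataclass
class NewStyle:                      # PEP 604 equivalent
    name: str | None = None
    score: int | float = 0
```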
@@ -52,7 +51,7 @@ class RequestFuncOutput:

async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(

async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(

async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(

async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(

async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(

async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
    pbar: tqdm | None = None,
) -> RequestFuncOutput:
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
    if pretrained_model_name_or_path is not None and not os.path.exists(
        pretrained_model_name_or_path
    ):
@@ -32,7 +32,6 @@ import dataclasses
import json
import random
import time
from typing import Optional

from transformers import PreTrainedTokenizerBase

@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
    fixed_output_len: Optional[int],
    fixed_output_len: int | None,
) -> list[Request]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    input_length_range: tuple[int, int],
    fixed_output_len: Optional[int],
    fixed_output_len: int | None,
    prefix_len: int,
) -> list[Request]:
    requests = []

@@ -7,7 +7,6 @@ import dataclasses
import json
import random
import time
from typing import Optional

from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -24,7 +23,7 @@ def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int],
    fixed_output_len: int | None,
) -> list[tuple[str, int, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
@@ -31,8 +31,8 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass
from typing import Optional

import datasets
import numpy as np
@@ -316,7 +316,7 @@ def calculate_metrics(
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: list[str],
    selected_percentiles: list[float],
    goodput_config_dict: Optional[dict[str, float]] = None,
    goodput_config_dict: dict[str, float] | None = None,
) -> tuple[BenchmarkMetrics, list[int]]:
    actual_output_lens: list[int] = []
    total_input = 0
@@ -436,9 +436,9 @@ async def benchmark(
    selected_percentile_metrics: list[str],
    selected_percentiles: list[str],
    ignore_eos: bool,
    max_concurrency: Optional[int],
    max_concurrency: int | None,
    structured_output_ratio: float,
    goodput_config_dict: Optional[dict[str, float]] = None,
    goodput_config_dict: dict[str, float] | None = None,
):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -502,15 +502,9 @@ async def benchmark(

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    # This can be used once the minimum Python version is 3.10 or higher,
    # and it will simplify the code in limited_request_func.
    # semaphore = (asyncio.Semaphore(max_concurrency)
    #              if max_concurrency else contextlib.nullcontext())
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited_request_func(request_func_input, pbar):
        if semaphore is None:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
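The last hunk above replaces the None sentinel with contextlib.nullcontext(), which on Python 3.10+ works directly in async with, so the separate None branch is no longer needed. A standalone sketch of the pattern, with illustrative names:

```python
import asyncio
from contextlib import nullcontext


async def fetch(i: int) -> int:
    await asyncio.sleep(0.01)
    return i


async def run_all(n: int, max_concurrency: int | None) -> list[int]:
    # nullcontext() is a no-op (a)sync context manager, so the same
    # `async with` works whether or not a concurrency limit is set.
    limiter = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited_fetch(i: int) -> int:
        async with limiter:
            return await fetch(i)

    return await asyncio.gather(*(limited_fetch(i) for i in range(n)))


print(asyncio.run(run_all(8, max_concurrency=2)))
```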
@@ -6,7 +6,7 @@ import math
import os
import time
from types import TracebackType
from typing import Any, Optional, Union
from typing import Any


def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
    def __init__(self, scale: int) -> None:
        self.cnt: int = 0
        self._sum: int = 0
        self._max: Optional[int] = None
        self._max: int | None = None
        self.scale = scale
        self.start_time: int = time.monotonic_ns()
@@ -104,13 +104,13 @@ class TimeCollector:
        else:
            self._max = max(self._max, v)

    def avg(self) -> Union[float, str]:
    def avg(self) -> float | str:
        return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

    def max(self) -> Union[float, str]:
    def max(self) -> float | str:
        return self._max / self.scale if self._max else "N/A"

    def dump_avg_max(self) -> list[Union[float, str]]:
    def dump_avg_max(self) -> list[float | str]:
        return [self.avg(), self.max()]

    def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        exc_traceback: Optional[TracebackType],
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        self.collect(time.monotonic_ns() - self.start_time)
@@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable
from collections.abc import Callable, Iterable

import torch
import torch.utils.benchmark as TBenchmark

@@ -6,8 +6,7 @@ import copy
import itertools
import pickle as pkl
import time
from collections.abc import Iterable
from typing import Callable, Optional
from collections.abc import Callable, Iterable

import torch
import torch.utils.benchmark as TBenchmark
@@ -53,7 +52,7 @@ def bench_int8(
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
    bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
    bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
@@ -183,7 +182,7 @@ def bench(
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
    bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(
    dtype: torch.dtype,
    MKNs: Iterable[tuple[int, int, int]],
    bench_kernels: Optional[list[str]] = None,
    bench_kernels: list[str] | None = None,
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
@@ -3,10 +3,9 @@

import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional

import torch
import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
):
    # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
def unfused_fp8_impl(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
):
    # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: Optional[torch.Tensor],
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Callable
from collections.abc import Callable
from unittest.mock import patch

import pandas as pd
@@ -22,8 +22,8 @@ Example:
import json
import os
import time
from collections.abc import Callable
from contextlib import nullcontext
from typing import Callable, Optional

import torch
import torch.distributed as dist
@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
    def benchmark_allreduce_single(
        self,
        sequence_length: int,
        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
        should_use_fn: Callable[[torch.Tensor], bool],
        context,
        num_warmup: int,
        num_trials: int,
    ) -> Optional[float]:
    ) -> float | None:
        """Benchmark method with CUDA graph optimization."""
        try:
            # Create test tensor (2D: sequence_length x hidden_size)
@@ -6,11 +6,12 @@ import copy
import json
import pickle
import time
from collections.abc import Callable
from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
from typing import Any, Callable, Optional
from typing import Any

import torch
import torch.utils.benchmark as TBenchmark
@@ -158,7 +159,7 @@ def ref_group_gemm(
    seq_lens_cpu: torch.Tensor,
    prompt_lora_mapping_cpu: torch.Tensor,
    scaling: float,
    add_inputs: Optional[bool],
    add_inputs: bool | None,
):
    """
    Torch group gemm reference implementation to test correctness of
@@ -316,8 +317,8 @@ class BenchmarkContext:
    lora_rank: int
    sort_by_lora_id: bool
    dtype: torch.dtype
    seq_length: Optional[int] = None
    num_slices: Optional[int] = None  # num_slices for slice based ops
    seq_length: int | None = None
    num_slices: int | None = None  # num_slices for slice based ops

    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
        ctx = copy.copy(self)
@@ -561,7 +562,7 @@ class BenchmarkTensors:
        }

    def bench_fn_kwargs(
        self, op_type: OpType, add_inputs: Optional[bool] = None
        self, op_type: OpType, add_inputs: bool | None = None
    ) -> dict[str, Any]:
        if op_type.is_shrink_fn():
            assert add_inputs is None
@@ -575,7 +576,7 @@ class BenchmarkTensors:
        raise ValueError(f"Unrecognized optype {self}")

    def test_correctness(
        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
        self, op_type: OpType, expand_fn_add_inputs: bool | None
    ) -> bool:
        """
        Test correctness of op_type implementation against a grouped gemm
@@ -611,8 +612,8 @@ def bench_optype(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
    cuda_graph_nops: Optional[int] = None,
    expand_fn_add_inputs: Optional[bool] = None,
    cuda_graph_nops: int | None = None,
    expand_fn_add_inputs: bool | None = None,
    test_correctness: bool = False,
) -> TMeasurement:
    assert arg_pool_size >= 1
@@ -679,7 +680,7 @@ def bench_torch_mm(
    ctx: BenchmarkContext,
    arg_pool_size: int,
    op_type: OpType,
    cuda_graph_nops: Optional[int] = None,
    cuda_graph_nops: int | None = None,
) -> TMeasurement:
    """
    Benchmark basic torch.mm as a roofline.
@@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
    """


def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
    compare = TBenchmark.Compare(timers)
    compare.print()
@@ -8,10 +8,9 @@ import math
import os
import pickle as pkl
import time
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Optional

import pandas as pd
import torch
@@ -63,23 +62,23 @@ class BenchmarkTensors:
    a: torch.Tensor

    w_q: torch.Tensor
    group_size: Optional[int]
    group_size: int | None
    wtype: ScalarType
    w_g_s: torch.Tensor
    w_g_zp: Optional[torch.Tensor]
    w_ch_s: Optional[torch.Tensor]
    w_tok_s: Optional[torch.Tensor]
    w_g_zp: torch.Tensor | None
    w_ch_s: torch.Tensor | None
    w_tok_s: torch.Tensor | None


@dataclass
class TypeConfig:
    act_type: torch.dtype
    weight_type: ScalarType
    output_type: Optional[torch.dtype]
    group_scale_type: Optional[torch.dtype]
    group_zero_type: Optional[torch.dtype]
    channel_scale_type: Optional[torch.dtype]
    token_scale_type: Optional[torch.dtype]
    output_type: torch.dtype | None
    group_scale_type: torch.dtype | None
    group_zero_type: torch.dtype | None
    channel_scale_type: torch.dtype | None
    token_scale_type: torch.dtype | None


def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
    atype: torch.dtype,
    w: torch.Tensor,
    wtype: ScalarType,
    stype: Optional[torch.dtype],
    group_size: Optional[int],
    stype: torch.dtype | None,
    group_size: int | None,
    zero_points: bool = False,
):
    assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(


def create_bench_tensors(
    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
) -> list[BenchmarkTensors]:
    m, n, k = shape

@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
    return res


_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None


def bench(
@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
block_quant_shape = get_weight_block_size_safety(config)
|
||||
|
||||
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_customized_permute = args.use_customized_permute
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
@ -37,7 +36,7 @@ def main(
|
||||
seed: int,
|
||||
do_profile: bool,
|
||||
device: str = "cuda",
|
||||
kv_cache_dtype: Optional[str] = None,
|
||||
kv_cache_dtype: str | None = None,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
|
||||
@ -3,8 +3,8 @@
|
||||
|
||||
import argparse
|
||||
import math
|
||||
from collections.abc import Callable
|
||||
from contextlib import contextmanager
|
||||
from typing import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
|
||||
@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
|
||||
residual: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
orig_dtype = x.dtype
|
||||
x = x.to(torch.float32)
|
||||
if residual is not None:
|
||||
@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
|
||||
def rmsnorm_naive(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
|
||||
@ -65,7 +64,7 @@ def rmsnorm_naive(
|
||||
def rmsnorm_flashinfer(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
|
||||
def rmsnorm_vllm(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from itertools import accumulate
|
||||
from typing import Optional
|
||||
|
||||
import nvtx
|
||||
import torch
|
||||
@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
rotary_dim: int | None,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
import csv
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import torch
|
||||
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
||||
@torch.no_grad()
|
||||
def benchmark_decode(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_len: int,
|
||||
num_heads: tuple[int, int] = (64, 8),
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
import csv
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import torch
|
||||
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
||||
@torch.no_grad()
|
||||
def benchmark_prefill(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_len: int,
|
||||
num_heads: tuple[int, int] = (64, 8),
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Callable, Optional
|
||||
from collections.abc import Callable, Iterable
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -55,7 +55,7 @@ class Bench:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cuda_graph_params: Optional[CudaGraphBenchParams],
|
||||
cuda_graph_params: CudaGraphBenchParams | None,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
description: str,
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from statistics import mean
|
||||
from typing import Any, NamedTuple, Optional, Union
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import numpy as np # type: ignore
|
||||
import pandas as pd # type: ignore
|
||||
@ -35,8 +35,8 @@ class Distribution(ABC):
|
||||
class UniformDistribution(Distribution):
|
||||
def __init__(
|
||||
self,
|
||||
min_val: Union[int, float],
|
||||
max_val: Union[int, float],
|
||||
min_val: int | float,
|
||||
max_val: int | float,
|
||||
is_integer: bool = True,
|
||||
) -> None:
|
||||
self.min_val = min_val
|
||||
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
|
||||
|
||||
|
||||
class ConstantDistribution(Distribution):
|
||||
def __init__(self, value: Union[int, float]) -> None:
|
||||
def __init__(self, value: int | float) -> None:
|
||||
self.value = value
|
||||
self.max_val = value
|
||||
|
||||
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
|
||||
|
||||
|
||||
class ZipfDistribution(Distribution):
|
||||
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
||||
self.alpha = alpha
|
||||
self.max_val = max_val
|
||||
|
||||
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
|
||||
|
||||
|
||||
class PoissonDistribution(Distribution):
|
||||
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
||||
self.alpha = alpha
|
||||
self.max_val = max_val
|
||||
|
||||
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
|
||||
class LognormalDistribution(Distribution):
|
||||
def __init__(
|
||||
self,
|
||||
mean: Optional[float] = None,
|
||||
sigma: Optional[float] = None,
|
||||
average: Optional[int] = None,
|
||||
median_ratio: Optional[float] = None,
|
||||
max_val: Optional[int] = None,
|
||||
mean: float | None = None,
|
||||
sigma: float | None = None,
|
||||
average: int | None = None,
|
||||
median_ratio: float | None = None,
|
||||
max_val: int | None = None,
|
||||
) -> None:
|
||||
self.average = average
|
||||
self.median_ratio = median_ratio
|
||||
|
||||
@ -13,7 +13,7 @@ from datetime import datetime
|
||||
from enum import Enum
|
||||
from http import HTTPStatus
|
||||
from statistics import mean
|
||||
from typing import NamedTuple, Union
|
||||
from typing import NamedTuple
|
||||
|
||||
import aiohttp # type: ignore
|
||||
import numpy as np # type: ignore
|
||||
@ -169,7 +169,7 @@ class MovingAverage:
|
||||
class DebugStats:
|
||||
def __init__(self, logger: logging.Logger, window_size: int) -> None:
|
||||
self.logger = logger
|
||||
self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
|
||||
self.metrics: dict[str, MovingAverage | MetricStats] = {
|
||||
"moving_avg_ttft_ms": MovingAverage(window_size),
|
||||
"moving_avg_tpot_ms": MovingAverage(window_size),
|
||||
"ttft_ms": MetricStats(),
|
||||
@ -636,7 +636,7 @@ async def client_main(
|
||||
|
||||
if args.verbose:
|
||||
curr_time_sec: float = time.perf_counter()
|
||||
time_since_last_turn: Union[str, float] = "N/A"
|
||||
time_since_last_turn: str | float = "N/A"
|
||||
if conv_id in time_of_last_turn:
|
||||
time_since_last_turn = round(
|
||||
curr_time_sec - time_of_last_turn[conv_id], 3
|
||||
@ -928,13 +928,13 @@ async def main_mp(
|
||||
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
|
||||
)
|
||||
|
||||
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
|
||||
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
|
||||
if len(client_metrics) < (5 * bench_args.num_clients):
|
||||
# Do not estimate the RPS if the number of samples is very low
|
||||
# (threshold can be tuned if needed)
|
||||
rps = "N/A"
|
||||
|
||||
runtime_left_sec: Union[str, float] = round(
|
||||
runtime_left_sec: str | float = round(
|
||||
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
|
||||
)
|
||||
if percent < 0.05:
|
||||
@ -1251,7 +1251,7 @@ async def main() -> None:
|
||||
default=None,
|
||||
help="The model name used in the API. "
|
||||
"If not specified, the model name will be the "
|
||||
"same as the ``--model`` argument. ",
|
||||
"same as the `--model` argument. ",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
|
||||
@ -13,7 +13,7 @@ import argparse
|
||||
import json
|
||||
import random
|
||||
from statistics import mean
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd # type: ignore
|
||||
import tqdm # type: ignore
|
||||
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
|
||||
|
||||
|
||||
def content_is_valid(
|
||||
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
|
||||
content: str, min_content_len: int | None, max_content_len: int | None
|
||||
) -> bool:
|
||||
if min_content_len and len(content) < min_content_len:
|
||||
return False
|
||||
@ -37,7 +37,7 @@ def content_is_valid(
|
||||
|
||||
|
||||
def print_stats(
|
||||
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
|
||||
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
|
||||
) -> None:
|
||||
# Collect statistics
|
||||
stats = []
|
||||
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
|
||||
seed: int,
|
||||
input_file: str,
|
||||
output_file: str,
|
||||
max_items: Optional[int],
|
||||
min_content_len: Optional[int] = None,
|
||||
max_content_len: Optional[int] = None,
|
||||
min_turns: Optional[int] = None,
|
||||
max_turns: Optional[int] = None,
|
||||
model: Optional[str] = None,
|
||||
max_items: int | None,
|
||||
min_content_len: int | None = None,
|
||||
max_content_len: int | None = None,
|
||||
min_turns: int | None = None,
|
||||
max_turns: int | None = None,
|
||||
model: str | None = None,
|
||||
) -> None:
|
||||
if min_turns and max_turns:
|
||||
assert min_turns <= max_turns
|
||||
|
||||
@@ -198,13 +198,24 @@ else()
endif()

if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")

if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
FetchContent_Declare(
oneDNN
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
)
else()
message(STATUS "Downloading oneDNN from GitHub")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()

if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)

@@ -227,7 +238,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
set(ONEDNN_VERBOSE "ON")
set(ONEDNN_VERBOSE "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

FetchContent_MakeAvailable(oneDNN)

@@ -309,4 +320,4 @@ define_gpu_extension_target(
WITH_SOABI
)

message(STATUS "Enabling C extension.")
message(STATUS "Enabling C extension.")

@@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif()

FetchContent_Populate(qutlass)

if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
codecov.yml (new file, 12 lines)
@@ -0,0 +1,12 @@
codecov:
  require_ci_to_pass: false

fixes:
  # Map source code paths to repository root paths
  # Wildcards match any Python version (python3.*)
  - "/vllm-workspace/src/vllm/::vllm/"
  - "/vllm-workspace/vllm/::vllm/"
  - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
  - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
  - "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
  - "/usr/lib/python3.*/site-packages/vllm/::vllm/"
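For context, each entry under `fixes:` is a `before::after` path rewrite that Codecov applies to the paths in uploaded coverage reports so they resolve against the repository root; the `python3.*` segment is a wildcard. A rough, illustrative sketch of that rewrite rule (the helper below is an assumption about the semantics, not vLLM or Codecov code):

```python
import re


def apply_fix(path: str, fix: str) -> str:
    """Apply one codecov-style 'before::after' path fix to a report path (sketch)."""
    before, after = fix.split("::", 1)
    # Wildcards in the rule (e.g. python3.*) are treated as "match anything".
    pattern = "^" + re.escape(before).replace(r"\*", ".*")
    return re.sub(pattern, after, path)


print(apply_fix(
    "/usr/local/lib/python3.11/site-packages/vllm/model_executor/layers/linear.py",
    "/usr/local/lib/python3.*/site-packages/vllm/::vllm/",
))
# -> vllm/model_executor/layers/linear.py
```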
@@ -125,32 +125,37 @@ public:
}

static void set_split_kv (KernelArguments& args) {
// printf("set_split_kv start");
if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape;
// std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count;
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
float seq_length_k = static_cast<float>(K) / 1024.0f;
int max_splits = 1;

// TODO: This avoids a hang when the batch size larger than 1 and
// there is more than 1 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
if (B <= 4 && seq_length_k >= 16) {
max_splits = 16;
}

// printf(" max_splits = %d\n", max_splits);
else if (B <= 8 && seq_length_k >= 4) {
max_splits = 8;
}
else if ((B <= 16 && seq_length_k >= 8) ||
(B == 48 && seq_length_k >= 32)) {
max_splits = 4;
}
else if ((B <= 32 && seq_length_k >= 16) ||
(B == 96 && seq_length_k >= 16)) {
max_splits = 2;
}
else {
max_splits = 1;
}

// Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware;
// printf(" args.split_kv = %d\n", args.split_kv);

}

/// Determines whether the GEMM can execute the given problem.
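The new `set_split_kv` heuristic above replaces the blanket "one split when batch > 1" workaround with a table keyed on batch size and KV length, followed by wave-aware rounding. A standalone Python transcription of that logic for experimentation (the 132-SM figure in the example is an assumption, roughly an H100-class part):

```python
def ceil_div(a: int, b: int) -> int:
    return -(-a // b)


def split_kv_heuristic(batch_size: int, seq_len_k: int, sm_count: int) -> int:
    """Python sketch of the set_split_kv heuristic shown in the diff above."""
    B = batch_size
    seq_length_k = seq_len_k / 1024.0

    if B <= 4 and seq_length_k >= 16:
        max_splits = 16
    elif B <= 8 and seq_length_k >= 4:
        max_splits = 8
    elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
        max_splits = 4
    elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
        max_splits = 2
    else:
        max_splits = 1

    # Wave-aware scheduling: round the split count so the K dimension fills
    # an integer number of waves. (The C++ also computes `waves`, but the
    # returned value depends only on max_splits and split_heur.)
    sms_per_batch = max(1, sm_count // B)
    split_heur = min(max_splits, sms_per_batch)
    k_waves = ceil_div(max_splits, split_heur)
    return ceil_div(max_splits, k_waves)


# Example: batch of 4, 32K-token KV cache, 132 SMs -> 16 KV splits.
print(split_kv_heuristic(4, 32 * 1024, 132))  # 16
```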
@@ -5,12 +5,15 @@

namespace vllm {

// vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_kernel_override_batch_invariant() {
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
// vllm_is_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() {
static bool cached = []() {
std::string env_key = "VLLM_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
}();
return cached;
}

} // namespace vllm
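The helper is renamed, the controlling environment variable changes from `VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT` to `VLLM_BATCH_INVARIANT`, and the lookup is now cached after the first call. A hedged sketch of the same check from the Python side (the function name is illustrative; it only mirrors the C++ semantics above):

```python
import functools
import os


@functools.cache  # mirror the C++ static-local caching: read the env var once
def is_batch_invariant() -> bool:
    # Renamed from VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT in the diff above.
    try:
        return int(os.getenv("VLLM_BATCH_INVARIANT", "0")) != 0
    except ValueError:
        # std::atoi would return 0 for non-numeric values; treat them as "off".
        return False


# Usage: export VLLM_BATCH_INVARIANT=1 before launching to take the
# batch-invariant kernel launch paths guarded by this flag.
print(is_batch_invariant())
```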
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
from typing import Union
|
||||
|
||||
from cutlass_library import *
|
||||
|
||||
@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
|
||||
TmaWarpSpecializedCooperative = enum_auto()
|
||||
|
||||
|
||||
VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
|
||||
**DataTypeNames, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: "u4b8",
|
||||
@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
**DataTypeTag, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
|
||||
@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
||||
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
|
||||
**DataTypeSize, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: 4,
|
||||
@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
VLLMDataType.u4b8: "vllm::kU4B8",
|
||||
VLLMDataType.u8b128: "vllm::kU8B128",
|
||||
DataType.u4: "vllm::kU4",
|
||||
@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
DataType.bf16: "vllm::kBfloat16",
|
||||
}
|
||||
|
||||
VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
DataType.u8: "at::ScalarType::Byte",
|
||||
DataType.s8: "at::ScalarType::Char",
|
||||
DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
|
||||
@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
DataType.f32: "at::ScalarType::Float",
|
||||
}
|
||||
|
||||
VLLMKernelScheduleTag: dict[
|
||||
Union[MixedInputKernelScheduleType, KernelScheduleType], str
|
||||
] = {
|
||||
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
|
||||
**KernelScheduleTag, # type: ignore
|
||||
**{
|
||||
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
|
||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
@ -352,18 +364,26 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
|
||||
int hidden_size = input.size(-1);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
int64_t input_stride = input.stride(-2);
|
||||
|
||||
// We cannot just use `input.stride(-2)` if the tensor is not row-major.
|
||||
// Instead, we use a 2d view to get the second-innermost stride.
|
||||
// That way the dimensions (except the last one) can be arbitrarily permuted.
|
||||
torch::Tensor input_view = input.view({-1, hidden_size});
|
||||
|
||||
int num_tokens = input_view.numel() / hidden_size;
|
||||
int64_t input_stride = input_view.stride(-2);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(hidden_size, 1024));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
|
||||
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
|
||||
});
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
input_view.scalar_type(), "rms_norm_kernel", [&] {
|
||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
|
||||
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
|
||||
hidden_size);
|
||||
});
|
||||
}
|
||||
|
||||
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
||||
@ -380,6 +400,8 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
torch::Tensor& residual, // [..., hidden_size]
|
||||
torch::Tensor& weight, // [hidden_size]
|
||||
double epsilon) {
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(weight.is_contiguous());
|
||||
int hidden_size = input.size(-1);
|
||||
@ -414,7 +436,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
wt_ptr % req_alignment_bytes == 0;
|
||||
bool offsets_are_multiple_of_vector_width =
|
||||
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
@ -462,7 +484,7 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
||||
LAUNCH_FUSED_POLY_NORM(8);
|
||||
} else {
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
@ -216,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
double epsilon) {
|
||||
TORCH_CHECK(out.is_contiguous());
|
||||
TORCH_CHECK(residual.is_contiguous());
|
||||
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
|
||||
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
|
||||
int hidden_size = input.size(-1);
|
||||
int input_stride = input.stride(-2);
|
||||
int num_tokens = input.numel() / hidden_size;
|
||||
@ -241,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||
bool ptrs_are_aligned =
|
||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
|
||||
@ -8,12 +8,77 @@
|
||||
|
||||
#include "../cuda_compat.h"
|
||||
#include "../dispatch_utils.h"
|
||||
#include "core/math.hpp"
|
||||
|
||||
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
||||
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
namespace batched_moe_align_block_size {
|
||||
|
||||
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
|
||||
static constexpr int32_t num_threads = 1024;
|
||||
static constexpr int32_t num_blocks = 1;
|
||||
__global__ void batched_moe_align_block_size_kernel(
|
||||
int32_t const num_batches, int32_t const max_tokens_per_batch,
|
||||
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
|
||||
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
|
||||
int32_t* __restrict__ num_tokens_post_pad) {
|
||||
// TODO(varun): This is a naive implementation. Could be optimized.
|
||||
|
||||
size_t const batch_id = threadIdx.x;
|
||||
size_t const stride = blockDim.x * gridDim.x;
|
||||
int32_t const num_blocks_per_batch =
|
||||
CEILDIV(max_tokens_per_batch, block_size);
|
||||
int32_t const sorted_ids_size =
|
||||
num_blocks_per_batch * num_batches * block_size;
|
||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
||||
int32_t const SENTINEL =
|
||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
||||
// Intialize sorted_ids
|
||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
||||
sorted_ids[i] = SENTINEL;
|
||||
}
|
||||
// Intialize expert_ids with -1
|
||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
||||
block_ids[i] = -1;
|
||||
}
|
||||
|
||||
int32_t b_num_tokens = 0;
|
||||
if (batch_id < num_batches) {
|
||||
b_num_tokens = batch_num_tokens[batch_id];
|
||||
}
|
||||
int32_t const ceil_b_num_tokens =
|
||||
CEILDIV(b_num_tokens, block_size) * block_size;
|
||||
|
||||
// Compute prefix sum over token counts per expert
|
||||
using BlockScan = cub::BlockScan<int32_t, 1024>;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
int cumsum_val;
|
||||
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
|
||||
__syncthreads();
|
||||
|
||||
bool const is_last_batch = batch_id == (num_batches - 1);
|
||||
if (is_last_batch) {
|
||||
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
|
||||
}
|
||||
|
||||
if (batch_id < num_batches) {
|
||||
int32_t const batch_offset = batch_id * max_tokens_per_batch;
|
||||
for (size_t i = 0; i < b_num_tokens; ++i) {
|
||||
sorted_ids[cumsum_val + i] = batch_offset + i;
|
||||
}
|
||||
|
||||
int32_t const block_start = cumsum_val / block_size;
|
||||
int32_t const num_blocks = ceil_b_num_tokens / block_size;
|
||||
for (size_t i = 0; i < num_blocks; ++i) {
|
||||
block_ids[block_start + i] = batch_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace batched_moe_align_block_size
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
});
|
||||
}
|
||||
|
||||
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
|
||||
int64_t block_size,
|
||||
torch::Tensor const& batch_num_tokens,
|
||||
torch::Tensor sorted_ids,
|
||||
torch::Tensor batch_ids,
|
||||
torch::Tensor num_tokens_post_pad) {
|
||||
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
|
||||
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
int32_t const B = batch_num_tokens.size(0);
|
||||
int32_t const num_blocks_per_batch =
|
||||
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
|
||||
int32_t const num_blocks = num_blocks_per_batch * B;
|
||||
int64_t const sorted_ids_size = num_blocks * block_size;
|
||||
|
||||
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
|
||||
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
|
||||
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
|
||||
TORCH_CHECK(B <= batched_kernel::num_threads);
|
||||
|
||||
batched_kernel::batched_moe_align_block_size_kernel<<<
|
||||
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
|
||||
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
|
||||
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>());
|
||||
}
|
||||
|
||||
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
|
||||
torch::Tensor& output) // [num_tokens, hidden_size]
|
||||
{
|
||||
|
||||
@@ -4,7 +4,7 @@

void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& gating_output);
torch::Tensor& gating_output, bool renormalize);

void moe_sum(torch::Tensor& input, torch::Tensor& output);

@@ -12,6 +12,14 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);

void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& expert_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);

#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,
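The new `batched_moe_align_block_size` entry point declared above (implemented in the CUDA hunk earlier in this diff) pads each batch's token count up to a multiple of `block_size`, writes the flattened token ids into `sorted_ids` with a sentinel in the padding slots, and tags each resulting block with its owning batch. A CPU-side Python sketch of that bookkeeping, under the assumption that it mirrors the kernel's sequential semantics:

```python
def ceil_to_multiple(x: int, m: int) -> int:
    return (x + m - 1) // m * m


def batched_moe_align_block_size(
    max_tokens_per_batch: int, block_size: int, batch_num_tokens: list[int]
):
    """Sequential sketch of the batched_moe_align_block_size CUDA kernel."""
    num_batches = len(batch_num_tokens)
    num_blocks_per_batch = -(-max_tokens_per_batch // block_size)
    sorted_ids_size = num_blocks_per_batch * num_batches * block_size
    sentinel = num_batches * max_tokens_per_batch  # marks padding slots

    sorted_ids = [sentinel] * sorted_ids_size
    block_ids = [-1] * (sorted_ids_size // block_size)

    cumsum = 0  # exclusive prefix sum of per-batch padded token counts
    for batch_id, n_tokens in enumerate(batch_num_tokens):
        ceil_n = ceil_to_multiple(n_tokens, block_size)
        # Tokens of this batch, flattened into the padded global layout.
        for i in range(n_tokens):
            sorted_ids[cumsum + i] = batch_id * max_tokens_per_batch + i
        # Each block produced by this batch is tagged with the batch id.
        for i in range(ceil_n // block_size):
            block_ids[cumsum // block_size + i] = batch_id
        cumsum += ceil_n

    return sorted_ids, block_ids, cumsum


ids, blocks, total = batched_moe_align_block_size(8, 4, [3, 6])
print(total)   # 12 tokens after padding (4 + 8)
print(blocks)  # [0, 1, 1, -1]
```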
@ -16,12 +16,22 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <type_traits>
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
#include "../cub_helpers.h"
|
||||
#include "../core/batch_invariant.hpp"
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#else
|
||||
#include <hip/hip_bf16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
typedef __hip_bfloat16 __nv_bfloat16;
|
||||
typedef __hip_bfloat162 __nv_bfloat162;
|
||||
#endif
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -37,16 +47,27 @@ template <
|
||||
/// Alignment requirement in bytes
|
||||
int Alignment = sizeof(T) * N
|
||||
>
|
||||
class alignas(Alignment) AlignedArray {
|
||||
float data[N];
|
||||
struct alignas(Alignment) AlignedArray {
|
||||
T data[N];
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ float toFloat(T value) {
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
return value;
|
||||
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
|
||||
return __bfloat162float(value);
|
||||
} else if constexpr (std::is_same_v<T, __half>) {
|
||||
return __half2float(value);
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== Softmax things ===============================
|
||||
// We have our own implementation of softmax here so we can support transposing the output
|
||||
// in the softmax kernel when we extend this module to support expert-choice routing.
|
||||
template <int TPB>
|
||||
template <int TPB, typename InputType>
|
||||
__launch_bounds__(TPB) __global__
|
||||
void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
|
||||
void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
|
||||
{
|
||||
using BlockReduce = cub::BlockReduce<float, TPB>;
|
||||
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||
@ -67,7 +88,8 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||
const float val = toFloat(input[idx]);
|
||||
threadData = max(val, threadData);
|
||||
}
|
||||
|
||||
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
|
||||
@ -82,7 +104,8 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||
const float val = toFloat(input[idx]);
|
||||
threadData += expf(val - float_max);
|
||||
}
|
||||
|
||||
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
|
||||
@ -96,8 +119,9 @@ __launch_bounds__(TPB) __global__
|
||||
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||
{
|
||||
const int idx = thread_row_offset + ii;
|
||||
const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
|
||||
output[idx] = val;
|
||||
const float val = toFloat(input[idx]);
|
||||
const float softmax_val = expf(val - float_max) * normalizing_factor;
|
||||
output[idx] = softmax_val;
|
||||
}
|
||||
}
|
||||
|
||||
@ -111,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
const int num_experts,
|
||||
const int k,
|
||||
const int start_expert,
|
||||
const int end_expert)
|
||||
const int end_expert,
|
||||
const bool renormalize)
|
||||
{
|
||||
|
||||
using cub_kvp = cub::KeyValuePair<int, float>;
|
||||
@ -126,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
|
||||
const bool row_is_active = finished ? !finished[block_row] : true;
|
||||
const int thread_read_offset = blockIdx.x * num_experts;
|
||||
float selected_sum = 0.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
thread_kvp.key = 0;
|
||||
@ -164,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
|
||||
assert(indices[idx] >= 0);
|
||||
source_rows[idx] = k_idx * num_rows + block_row;
|
||||
if (renormalize) {
|
||||
selected_sum += result_kvp.value;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Renormalize the k weights for this row to sum to 1, if requested.
|
||||
if (renormalize) {
|
||||
if (threadIdx.x == 0) {
|
||||
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx) {
|
||||
const int idx = k * block_row + k_idx;
|
||||
output[idx] = output[idx] / denom;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ====================== TopK softmax things ===============================
|
||||
@ -185,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
2) This implementation assumes k is small, but will work for any k.
|
||||
*/
|
||||
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
|
||||
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
|
||||
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert)
|
||||
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
|
||||
{
|
||||
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
|
||||
std::is_same_v<InputType, __half>,
|
||||
"InputType must be float, __nv_bfloat16, or __half");
|
||||
|
||||
// We begin by enforcing compile time assertions and setting up compile time constants.
|
||||
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
|
||||
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
|
||||
|
||||
// Number of bytes each thread pulls in per load
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
|
||||
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
|
||||
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
|
||||
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
|
||||
|
||||
if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
|
||||
static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
|
||||
"ELTS_PER_LDG must be 1 or even for 16-bit conversion");
|
||||
}
|
||||
|
||||
// Restrictions based on previous section.
|
||||
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
|
||||
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
|
||||
@ -237,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
|
||||
// We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
|
||||
// row it will read.
|
||||
const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
|
||||
const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
|
||||
|
||||
// Now, we compute the group each thread belong to in order to determine the first column to start loads.
|
||||
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
|
||||
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
|
||||
const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
|
||||
|
||||
// Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
|
||||
// this can support all powers of 2 up to 16.
|
||||
// NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
|
||||
// We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
|
||||
using AccessType = AlignedArray<float, ELTS_PER_LDG>;
|
||||
const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
|
||||
|
||||
// Finally, we pull in the data from global mem
|
||||
float row_chunk[VPT];
|
||||
AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
|
||||
const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
|
||||
|
||||
// NOTE(zhuhaoran): dispatch different input types loading, BF16/FP16 convert to float
|
||||
if constexpr (std::is_same_v<InputType, float>) {
|
||||
using VecType = AlignedArray<float, ELTS_PER_LDG>;
|
||||
VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
|
||||
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
|
||||
{
|
||||
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
}
|
||||
} else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
|
||||
if constexpr (ELTS_PER_LDG >= 2) {
|
||||
using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
|
||||
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
|
||||
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
|
||||
#pragma unroll
|
||||
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
|
||||
row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
|
||||
*reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
|
||||
);
|
||||
}
|
||||
}
|
||||
} else { // ELTS_PER_LDG == 1
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
|
||||
row_chunk[ii] = __bfloat162float(*scalar_ptr);
|
||||
}
|
||||
}
|
||||
} else if constexpr (std::is_same_v<InputType, __half>) {
|
||||
if constexpr (ELTS_PER_LDG >= 2) {
|
||||
using VecType = AlignedArray<__half, ELTS_PER_LDG>;
|
||||
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
|
||||
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
|
||||
#pragma unroll
|
||||
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
|
||||
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
|
||||
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
|
||||
);
|
||||
}
|
||||
}
|
||||
} else { // ELTS_PER_LDG == 1
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
|
||||
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
|
||||
row_chunk[ii] = __half2float(*scalar_ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
||||
@ -311,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
int start_col = first_elt_read_by_thread;
|
||||
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||
|
||||
float selected_sum = 0.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
// First, each thread does the local argmax
|
||||
@ -364,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
output[idx] = max_val;
|
||||
indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
|
||||
source_rows[idx] = k_idx * num_rows + thread_row;
|
||||
if (renormalize) {
|
||||
selected_sum += max_val;
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, we clear the value in the thread with the current max if there is another iteration to run.
|
||||
@ -381,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Renormalize the k weights for this row to sum to 1, if requested.
|
||||
if (renormalize) {
|
||||
if (thread_group_idx == 0)
|
||||
{
|
||||
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
|
||||
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||
{
|
||||
const int idx = k * thread_row + k_idx;
|
||||
output[idx] = output[idx] / denom;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace detail
|
||||
{
|
||||
// Constructs some constants needed to partition the work across threads at compile time.
|
||||
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
|
||||
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
|
||||
struct TopkConstants
|
||||
{
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
|
||||
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
|
||||
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
|
||||
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
|
||||
@ -398,21 +508,21 @@ struct TopkConstants
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
|
||||
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
|
||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
|
||||
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
|
||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
|
||||
cudaStream_t stream)
|
||||
{
|
||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
|
||||
static constexpr int VPT = Constants::VPT;
|
||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
|
||||
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
@ -420,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream);
|
||||
#else
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
|
||||
num_tokens, topk, 0, num_experts, renormalize, stream); \
|
||||
} else { \
|
||||
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
}
#endif

template <typename IndType>
template <typename IndType, typename InputType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
const InputType* gating_output,
float* topk_weights,
IndType* topk_indices,
int* token_expert_indices,
@@ -447,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
const int num_tokens,
const int num_experts,
const int topk,
const bool renormalize,
cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
// for bfloat16 dtype, we need 4 bytes loading to make sure num_experts
// elements can be loaded by a warp
static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
(std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
#endif
switch (num_experts) {
case 1:
@@ -508,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
TORCH_CHECK(softmax_workspace != nullptr,
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256;
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
num_experts, topk, 0, num_experts, renormalize);
}
}
}
@@ -520,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
} // namespace moe
} // namespace vllm

template<typename ComputeType>
void dispatch_topk_softmax_launch(
torch::Tensor& gating_output,
torch::Tensor& topk_weights,
torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& softmax_workspace,
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
{
if (topk_indices.scalar_type() == at::ScalarType::Int) {
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
}
}

void topk_softmax(
torch::Tensor& topk_weights, // [num_tokens, topk]
torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output) // [num_tokens, num_experts]
torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize)
{
const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts;
@@ -536,45 +689,19 @@ void topk_softmax(

const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);

if(topk_indices.scalar_type() == at::ScalarType::Int)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
}
}

@@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Apply topk softmax to the gating outputs.
m.def(
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
"token_expert_indices, Tensor gating_output) -> ()");
"token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);

// Calculate the result of moe by summing up the partial results
@@ -22,6 +22,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);

// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size, but for the batched case.
m.def(
"batched_moe_align_block_size(int max_tokens_per_batch,"
" int block_size, Tensor expert_num_tokens,"
" Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);

#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

@@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
}
TORCH_CHECK(weight.dtype() == input.dtype());
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
}

VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {

@@ -9,7 +9,6 @@ from collections.abc import Iterable
from copy import deepcopy
from dataclasses import dataclass, fields
from functools import reduce
from typing import Optional, Union

import jinja2
from vllm_cutlass_library_extension import (
@@ -259,7 +258,7 @@ class ScheduleConfig:
@dataclass(frozen=True)
class TypeConfig:
a: DataType
b: Union[DataType, VLLMDataType]
b: DataType | VLLMDataType
b_group_scale: DataType
b_group_zeropoint: DataType
b_channel_scale: DataType
@@ -280,7 +279,7 @@ class PrepackTypeConfig:
class ImplConfig:
types: TypeConfig
schedules: list[ScheduleConfig]
heuristic: list[tuple[Optional[str], ScheduleConfig]]
heuristic: list[tuple[str | None, ScheduleConfig]]


def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

@ -22,13 +22,14 @@ template <typename AllReduceKernel, typename T>
|
||||
__global__ __quickreduce_launch_bounds_two_shot__ static void
|
||||
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
||||
int rank, uint8_t** dbuffer_list,
|
||||
uint32_t data_offset, uint32_t flag_color) {
|
||||
uint32_t data_offset, uint32_t flag_color,
|
||||
int64_t data_size_per_phase) {
|
||||
int block = blockIdx.x;
|
||||
int grid = gridDim.x;
|
||||
|
||||
while (block < num_blocks) {
|
||||
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
|
||||
flag_color);
|
||||
flag_color, data_size_per_phase);
|
||||
block += grid;
|
||||
flag_color++;
|
||||
}
|
||||
@ -41,21 +42,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color); \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
} else if (world_size == 4) { \
|
||||
using LineCodec = __codec<T, 4>; \
|
||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color); \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
} else if (world_size == 8) { \
|
||||
using LineCodec = __codec<T, 8>; \
|
||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color); \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
}
|
||||
|
||||
enum QuickReduceQuantLevel {
|
||||
|
||||
@ -553,13 +553,12 @@ struct AllReduceTwoshot {
|
||||
int const rank, // rank index
|
||||
uint8_t** __restrict__ buffer_list, // communication buffers
|
||||
uint32_t const data_offset, // offset to start of the data buffer
|
||||
uint32_t flag_color) {
|
||||
uint32_t flag_color, int64_t data_size_per_phase) {
|
||||
// Topology
|
||||
int thread = threadIdx.x + threadIdx.y * kWavefront;
|
||||
uint8_t* rank_buffer = buffer_list[rank];
|
||||
Codec codec(thread, rank);
|
||||
int block_id = blockIdx.x;
|
||||
int grid_size = gridDim.x;
|
||||
// --------------------------------------------------------
|
||||
// Read input into registers
|
||||
int32x4_t tA[kAtoms];
|
||||
@ -588,12 +587,10 @@ struct AllReduceTwoshot {
|
||||
// rank responsible for this segment.
|
||||
uint32_t comm_data0_offset =
|
||||
data_offset + block_id * Codec::kTransmittedTileSize;
|
||||
uint32_t comm_data1_offset =
|
||||
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
|
||||
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset;
|
||||
|
||||
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
|
||||
uint32_t comm_flags1_offset =
|
||||
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
|
||||
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset;
|
||||
|
||||
for (int r = 0; r < kWorldSize; r++) {
|
||||
int32x4_t* send_buffer =
|
||||
|
||||
@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
# sync the default value with .buildkite/check-wheel-size.py
|
||||
ARG VLLM_MAX_SIZE_MB=450
|
||||
ARG VLLM_MAX_SIZE_MB=500
|
||||
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
||||
ARG RUN_WHEEL_CHECK=true
|
||||
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||
@ -359,8 +359,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==0.4.0 \
|
||||
&& uv pip install --system flashinfer-jit-cache==0.4.0 \
|
||||
uv pip install --system flashinfer-cubin==0.4.1 \
|
||||
&& uv pip install --system flashinfer-jit-cache==0.4.1 \
|
||||
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& flashinfer show-config
|
||||
|
||||
|
||||
@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
|
||||
|
||||
|
||||
# build flashinfer for torch nightly from source around 10 mins
|
||||
# release version: v0.4.0
|
||||
# release version: v0.4.1
|
||||
# todo(elainewy): cache flashinfer build result for faster build
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
echo "git clone flashinfer..." \
|
||||
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||
&& cd flashinfer \
|
||||
&& git checkout v0.4.0 \
|
||||
&& git checkout v0.4.1 \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo "finish git clone flashinfer..." \
|
||||
&& rm -rf build \
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
|
||||
ARG BASE_UBI_IMAGE_TAG=9.6-1754584681
|
||||
|
||||
###############################################################
|
||||
# Stage to build openblas
|
||||
@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
|
||||
&& source /opt/rh/gcc-toolset-13/enable \
|
||||
&& wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
|
||||
@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
|
||||
FROM centos-deps-builder AS base-builder
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
|
||||
# Set Environment Variables for venv, cargo & openblas
|
||||
ENV VIRTUAL_ENV=/opt/vllm
|
||||
@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
|
||||
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
|
||||
libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
|
||||
harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
|
||||
&& dnf clean all \
|
||||
&& PREFIX=/usr/local make -C /openblas install \
|
||||
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
|
||||
@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
|
||||
FROM base-builder AS torch-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG TORCH_VERSION=2.6.0
|
||||
ARG TORCH_VERSION=2.7.0
|
||||
ARG _GLIBCXX_USE_CXX11_ABI=1
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
|
||||
PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
|
||||
|
||||
ARG TORCHVISION_VERSION=0.21.0
|
||||
ARG TORCHVISION_VERSION=0.22.0
|
||||
ARG TORCHVISION_USE_NVJPEG=0
|
||||
ARG TORCHVISION_USE_FFMPEG=0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
BUILD_VERSION=${TORCHVISION_VERSION} \
|
||||
uv build --wheel --out-dir /torchwheels/ --no-build-isolation
|
||||
|
||||
ARG TORCHAUDIO_VERSION=2.6.0
|
||||
ARG TORCHAUDIO_VERSION=2.7.0
|
||||
ARG BUILD_SOX=1
|
||||
ARG BUILD_KALDI=1
|
||||
ARG BUILD_RNNT=1
|
||||
@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG PYARROW_PARALLEL
|
||||
ARG PYARROW_VERSION=19.0.1
|
||||
ARG PYARROW_VERSION=21.0.0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
|
||||
@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
make install -j ${MAX_JOBS:-$(nproc)} && \
|
||||
cd ../../python/ && \
|
||||
uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
|
||||
pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
|
||||
PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
|
||||
python setup.py build_ext \
|
||||
--build-type=release --bundle-arrow-cpp \
|
||||
@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
|
||||
&& make -j ${MAX_JOBS:-$(nproc)}
|
||||
|
||||
|
||||
###############################################################
|
||||
# Stage to build numba
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS numba-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG NUMBA_VERSION=0.61.2
|
||||
|
||||
# Clone all required dependencies
|
||||
RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
|
||||
git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
|
||||
cd ./numba && \
|
||||
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
|
||||
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
|
||||
fi && python -m build --wheel --installer=uv --outdir /numbawheels/
|
||||
|
||||
###############################################################
|
||||
# Stage to build vllm - this stage builds and installs
|
||||
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
|
||||
@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null
|
||||
COPY --from=arrow-builder /tmp/control /dev/null
|
||||
COPY --from=cv-builder /tmp/control /dev/null
|
||||
COPY --from=numa-builder /tmp/control /dev/null
|
||||
COPY --from=numba-builder /tmp/control /dev/null
|
||||
|
||||
ARG VLLM_TARGET_DEVICE=cpu
|
||||
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
# this step installs vllm and populates uv cache
|
||||
# with all the transitive dependencies
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
dnf install llvm15 llvm15-devel -y && \
|
||||
rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
|
||||
uv pip install maturin && \
|
||||
@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
|
||||
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
|
||||
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
|
||||
--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
|
||||
--mount=type=bind,src=.,dst=/src/,rw \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
|
||||
export PATH=$PATH:/usr/lib64/llvm15/bin && \
|
||||
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
|
||||
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
|
||||
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
|
||||
sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
|
||||
uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
|
||||
make -C /numactl install && \
|
||||
# sentencepiece.pc is in some pkgconfig inside uv cache
|
||||
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
|
||||
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
|
||||
nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
|
||||
cd /src/ && \
|
||||
uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
|
||||
uv pip install /vllmwheel/*.whl
|
||||
@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
|
||||
# Set Environment Variables for venv & openblas
|
||||
ENV VIRTUAL_ENV=/opt/vllm
|
||||
@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
|
||||
COPY --from=numa-builder /tmp/control /dev/null
|
||||
COPY --from=lapack-builder /tmp/control /dev/null
|
||||
COPY --from=openblas-builder /tmp/control /dev/null
|
||||
COPY --from=numba-builder /tmp/control /dev/null
|
||||
|
||||
# install gcc-11, python, openblas, numactl, lapack
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
|
||||
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
|
||||
microdnf install --nodocs -y \
|
||||
tar findutils openssl \
|
||||
libomp tar findutils openssl llvm15 llvm15-devel \
|
||||
pkgconfig xsimd g++ gcc-fortran libsndfile \
|
||||
libtiff libjpeg openjpeg2 zlib zeromq \
|
||||
freetype lcms2 libwebp tcl tk utf8proc \
|
||||
harfbuzz fribidi libraqm libimagequant libxcb \
|
||||
harfbuzz fribidi libraqm libimagequant libxcb util-linux \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
|
||||
&& microdnf clean all \
|
||||
&& export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
|
||||
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
|
||||
&& python -m pip install -U pip uv --no-cache \
|
||||
&& make -C /numactl install \
|
||||
@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
|
||||
--mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
|
||||
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
|
||||
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
|
||||
--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
|
||||
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
|
||||
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
|
||||
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
WORKDIR /workspace/vllm
|
||||
@ -314,4 +340,4 @@ WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
|
||||
RUN apt-get update -q -y && apt-get install -q -y \
|
||||
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
|
||||
apt-transport-https ca-certificates wget curl
|
||||
# Remove sccache
|
||||
# Remove sccache
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# install nixl from source code
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
|
||||
- [vllm.config.CompilationConfig][]
|
||||
- [vllm.config.VllmConfig][]
|
||||
|
||||
[](){ #offline-inference-api }
|
||||
|
||||
## Offline Inference
|
||||
|
||||
LLM Class.
|
||||
@ -45,18 +43,14 @@ Engine classes for offline and online inference.
|
||||
|
||||
Inference parameters for vLLM APIs.
|
||||
|
||||
[](){ #sampling-params }
|
||||
|
||||
- [vllm.SamplingParams][]
|
||||
- [vllm.PoolingParams][]
|
||||
|
||||
[](){ #multi-modality }
|
||||
|
||||
## Multi-Modality
|
||||
|
||||
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
|
||||
|
||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
|
||||
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
|
||||
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
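As a hedged illustration of that field (not taken from this page), a prompt can be passed as a dictionary whose `multi_modal_data` entry carries the image; the model name and chat template below are assumptions and should be adapted to the model you actually serve.

```python
# Minimal sketch, assuming a multimodal-capable model and that the prompt
# dictionary form with "multi_modal_data" is available in your vLLM version.
from PIL import Image
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
image = Image.open("example.jpg")  # placeholder path

outputs = llm.generate({
    "prompt": "USER: <image>\nDescribe the picture. ASSISTANT:",  # model-specific template
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```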
|
||||
|
||||
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
|
||||
|
||||
@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.
|
||||
|
||||
There are three main levels of configuration, from highest priority to lowest priority:
|
||||
|
||||
- [Request parameters][completions-api] and [input arguments][sampling-params]
|
||||
- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters)
|
||||
- [Engine arguments](./engine_args.md)
|
||||
- [Environment variables](./env_vars.md)
|
||||
|
||||
@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
|
||||
tensor_parallel_size=2)
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
@ -24,7 +23,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
|
||||
!!! note
|
||||
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
||||
|
||||
You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
||||
You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
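As a rough sketch of the loading side (not shown on this page), the converted checkpoint is pointed at directly and loaded with the sharded-state format; `load_format="sharded_state"` is an assumption about your vLLM version and should be verified against the example script.

```python
# Hedged sketch: loading a checkpoint produced by save_sharded_state.py,
# assuming the "sharded_state" load format is available in your vLLM version.
from vllm import LLM

llm = LLM(
    model="/path/to/sharded/checkpoint",  # directory written by the conversion script
    load_format="sharded_state",
    tensor_parallel_size=2,  # should match the TP size used when saving
)
```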
|
||||
|
||||
## Quantization
|
||||
|
||||
@ -38,14 +37,13 @@ Dynamic quantization is also supported via the `quantization` option -- see [her
|
||||
## Context length and batch size
|
||||
|
||||
You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
|
||||
and the maximum batch size (`max_num_seqs` option).
|
||||
and the maximum batch size (`max_num_seqs` option). Setting `max_model_len=-1` lets vLLM automatically
|
||||
pick the largest context length that fits in GPU memory, up to the model's default maximum.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2)
|
||||
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
|
||||
```
|
||||
|
||||
## Reduce CUDA Graphs
|
||||
@ -61,12 +59,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from vllm.config import CompilationConfig, CompilationLevel
|
||||
from vllm.config import CompilationConfig, CompilationMode
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
# By default, it goes up to max_num_seqs
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
||||
),
|
||||
@ -78,8 +76,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
enforce_eager=True)
|
||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
|
||||
```
|
||||
|
||||
## Adjust cache size
|
||||
@ -97,8 +94,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
|
||||
from vllm import LLM
|
||||
|
||||
# Accept up to 3 images and 1 video per prompt
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"image": 3, "video": 1})
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"image": 3, "video": 1},
|
||||
)
|
||||
```
|
||||
|
||||
You can go a step further and disable unused modalities completely by setting their limit to zero.
|
||||
@ -108,8 +107,10 @@ For example, if your application only accepts image input, there is no need to a
|
||||
from vllm import LLM
|
||||
|
||||
# Accept any number of images but no videos
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"video": 0})
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"video": 0},
|
||||
)
|
||||
```
|
||||
|
||||
You can even run a multi-modal model for text-only inference:
|
||||
@ -118,8 +119,10 @@ You can even run a multi-modal model for text-only inference:
|
||||
from vllm import LLM
|
||||
|
||||
# Don't accept images. Just text.
|
||||
llm = LLM(model="google/gemma-3-27b-it",
|
||||
limit_mm_per_prompt={"image": 0})
|
||||
llm = LLM(
|
||||
model="google/gemma-3-27b-it",
|
||||
limit_mm_per_prompt={"image": 0},
|
||||
)
|
||||
```
|
||||
|
||||
### Configurable options
|
||||
@ -173,14 +176,14 @@ Here are some examples:
|
||||
from vllm import LLM
|
||||
|
||||
# Available for Qwen2-VL series models
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_kwargs={
|
||||
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
|
||||
})
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28
|
||||
)
|
||||
|
||||
# Available for InternVL series models
|
||||
llm = LLM(model="OpenGVLab/InternVL2-2B",
|
||||
mm_processor_kwargs={
|
||||
"max_dynamic_patch": 4, # Default is 12
|
||||
})
|
||||
llm = LLM(
|
||||
model="OpenGVLab/InternVL2-2B",
|
||||
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12
|
||||
)
|
||||
```
|
||||
|
||||
@ -27,8 +27,6 @@ You can monitor the number of preemption requests through Prometheus metrics exp
|
||||
|
||||
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
|
||||
|
||||
[](){ #chunked-prefill }
|
||||
|
||||
## Chunked Prefill
|
||||
|
||||
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
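As a minimal sketch of how this is typically tuned (assuming the `enable_chunked_prefill` and `max_num_batched_tokens` engine arguments behave as described here):

```python
# Minimal sketch: tuning chunked prefill via engine arguments (assumed names).
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    enable_chunked_prefill=True,  # may already be on by default in recent versions
    # Budget of prefill + decode tokens batched per step: smaller values
    # favor inter-token latency, larger values favor throughput.
    max_num_batched_tokens=2048,
)
```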
|
||||
@ -100,7 +98,7 @@ from vllm import LLM
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.3-70B-Instruct,
|
||||
tensor_parallel_size=4,
|
||||
pipeline_parallel_size=2
|
||||
pipeline_parallel_size=2,
|
||||
)
|
||||
```
|
||||
|
||||
@ -174,14 +172,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
|
||||
|
||||
Known supported models (with corresponding benchmarks):
|
||||
|
||||
- dots_ocr (<gh-pr:25466>)
|
||||
- GLM-4.1V or above (<gh-pr:23168>)
|
||||
- InternVL (<gh-pr:23909>)
|
||||
- Kimi-VL (<gh-pr:23817>)
|
||||
- Llama4 (<gh-pr:18368>)
|
||||
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
||||
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
|
||||
- Step3 (<gh-pr:22697>)
|
||||
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
|
||||
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
|
||||
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
|
||||
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
|
||||
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
|
||||
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
|
||||
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
|
||||
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)
|
||||
|
||||
## Input Processing
|
||||
|
||||
@ -257,18 +255,24 @@ Examples:
|
||||
|
||||
```python
|
||||
# Use a larger cache
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=8)
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=8,
|
||||
)
|
||||
|
||||
# Use a shared-memory based IPC cache
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
mm_processor_cache_type="shm",
|
||||
mm_processor_cache_gb=8)
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
mm_processor_cache_type="shm",
|
||||
mm_processor_cache_gb=8,
|
||||
)
|
||||
|
||||
# Disable the cache
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=0)
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=0,
|
||||
)
|
||||
```
|
||||
|
||||
### Cache Placement
|
||||
|
||||
@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ
|
||||
|
||||
### Tune your workloads
|
||||
|
||||
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
||||
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
||||
|
||||
### Future Topics We'll Cover
|
||||
|
||||
|
||||
@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
|
||||
|
||||
## License
|
||||
|
||||
See <gh-file:LICENSE>.
|
||||
See [LICENSE](../../LICENSE).
|
||||
|
||||
## Developing
|
||||
|
||||
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
|
||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||
|
||||
!!! tip
|
||||
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
||||
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
||||
|
||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||
|
||||
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
||||
|
||||
### Documentation
|
||||
|
||||
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
|
||||
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).
|
||||
|
||||
Get started with:
|
||||
|
||||
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
|
||||
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||
|
||||
!!! important
|
||||
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
|
||||
If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).
|
||||
|
||||
## Pull Requests & Code Reviews
|
||||
|
||||
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
|
||||
|
||||
### DCO and Signed-off-by
|
||||
|
||||
When contributing changes to this project, you must agree to the <gh-file:DCO>.
|
||||
When contributing changes to this project, you must agree to the [DCO](../../DCO).
|
||||
Commits must include a `Signed-off-by:` header which certifies agreement with
|
||||
the terms of the DCO.
|
||||
|
||||
|
||||
@ -7,8 +7,8 @@ toc_depth: 4
|
||||
vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
|
||||
|
||||
- **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
|
||||
- **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
|
||||
- **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
|
||||
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development
|
||||
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
|
||||
|
||||
[Benchmark CLI]: #benchmark-cli
|
||||
|
||||
@ -35,6 +35,7 @@ th {
|
||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||
| Random | ✅ | ✅ | `synthetic` |
|
||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
||||
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
|
||||
@ -821,7 +822,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
|
||||
- CLIP: `--backend openai-embeddings-clip`
|
||||
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
|
||||
|
||||
For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
|
||||
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.
|
||||
|
||||
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
|
||||
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
|
||||
@ -878,7 +879,50 @@ vllm bench serve \
|
||||
|
||||
</details>
|
||||
|
||||
[](){ #performance-benchmarks }
|
||||
#### Reranker Benchmark
|
||||
|
||||
Benchmark the performance of rerank requests in vLLM.
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Unlike generative models which use Completions API or Chat Completions API,
|
||||
you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
|
||||
|
||||
For reranking, the only supported dataset is `--dataset-name random-rerank`.
|
||||
|
||||
Start the server:
|
||||
|
||||
```bash
|
||||
vllm serve BAAI/bge-reranker-v2-m3
|
||||
```
|
||||
|
||||
Run the benchmark:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--model BAAI/bge-reranker-v2-m3 \
|
||||
--backend vllm-rerank \
|
||||
--endpoint /v1/rerank \
|
||||
--dataset-name random-rerank \
|
||||
--tokenizer BAAI/bge-reranker-v2-m3 \
|
||||
--random-input-len 512 \
|
||||
--num-prompts 10 \
|
||||
--random-batch-size 5
|
||||
```
|
||||
|
||||
For reranker models, this will create `num_prompts / random_batch_size` requests with
|
||||
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
|
||||
In the example above, this results in 2 rerank requests with 5 "documents" each where
|
||||
each document has close to 512 tokens.
|
||||
|
||||
Please note that `/v1/rerank` is also supported by embedding models, so if you're running with an embedding model, also set `--no_reranker`. In that case the query is treated as an individual prompt by the server, so we send `random_batch_size - 1` documents to account for the extra prompt (the query). The token accounting used to report throughput numbers is adjusted accordingly.
|
||||
|
||||
</details>
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
@ -916,7 +960,7 @@ For more results visualization, check the [visualizing the results](https://gith
|
||||
|
||||
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||
|
||||
### Continuous Benchmarking
|
||||
|
||||
@ -942,12 +986,10 @@ The benchmarking currently runs on a predefined set of models configured in the
|
||||
|
||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
[](){ #nightly-benchmarks }
|
||||
|
||||
## Nightly Benchmarks
|
||||
|
||||
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
|
||||
|
||||
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
||||
|
||||
More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
||||
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
||||
|
||||
@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
|
||||
|
||||
Strip timestamps and colorization:
|
||||
|
||||
<gh-file:.buildkite/scripts/ci-clean-log.sh>
|
||||
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
|
||||
|
||||
```bash
|
||||
./ci-clean-log.sh ci.log
|
||||
@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
|
||||
|
||||
CI test failures may be flaky. Use a bash loop to run repeatedly:
|
||||
|
||||
<gh-file:.buildkite/scripts/rerun-test.sh>
|
||||
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)
|
||||
|
||||
```bash
|
||||
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
|
||||
|
||||
@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
|
||||
PyTorch version as early as possible when a new [PyTorch stable
|
||||
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
|
||||
This process is non-trivial due to the gap between PyTorch
|
||||
releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
|
||||
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
|
||||
update along with a list of potential issues and how to address them.
|
||||
|
||||
## Test PyTorch release candidates (RCs)
|
||||
@ -85,7 +85,7 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
|
||||
it doesn't populate the cache, so re-running it to warm up the cache
|
||||
is ineffective.
|
||||
|
||||
While ongoing efforts like [#17419](gh-issue:17419)
|
||||
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
|
||||
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
||||
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
|
||||
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
||||
@ -138,5 +138,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
|
||||
for different platforms in vLLM CI/CD allows us to selectively choose
|
||||
which platforms to update. For instance, updating XPU requires the corresponding
|
||||
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
|
||||
While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
||||
<gh-pr:17444> completed the update for XPU.
|
||||
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
||||
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Dockerfile
|
||||
|
||||
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
||||
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
|
||||
More information about deploying with Docker can be found [here](../../deployment/docker.md).
|
||||
|
||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Summary
|
||||
|
||||
!!! important
|
||||
Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
|
||||
|
||||
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
|
||||
## 1. Bring your model code
|
||||
|
||||
First, clone the PyTorch model code from the source repository.
|
||||
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
|
||||
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
|
||||
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||
|
||||
!!! warning
|
||||
@ -73,8 +73,8 @@ def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
intermediate_tensors: IntermediateTensors | None = None,
|
||||
inputs_embeds: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
@ -83,7 +83,7 @@ def forward(
|
||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||
|
||||
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
||||
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.
|
||||
|
||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||
|
||||
@ -130,22 +130,22 @@ We consider 3 different scenarios:
|
||||
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
||||
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
||||
|
||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
||||
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
||||
V0-only classes and code will be removed in the very near future.
|
||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
|
||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
|
||||
|
||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
||||
|
||||
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
||||
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
||||
Please follow the same guidelines as case (2) for implementing these models.
|
||||
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
||||
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
||||
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
||||
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
|
||||
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
|
||||
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
||||
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
|
||||
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
|
||||
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
|
||||
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs work as intended.
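To make the prose above easier to follow, here is an illustrative skeleton of a custom mamba-like layer. The base class, method names and signatures are restated from the text or assumed, so treat it as a sketch to check against the referenced source files rather than a working implementation.

```python
# Illustrative skeleton only; import paths and exact signatures are assumptions.
import torch


class MyShortStateLayer:  # in vLLM this would inherit from MambaBase
    """A mamba-like layer: it owns a state that is updated in place, not appended to."""

    mamba_type = "my_short_state"  # assumed identifier for this layer kind

    def get_state_dtype(self):
        # Data type(s) of the in-place state, derived from the model config.
        return (torch.bfloat16,)

    def get_state_shape(self):
        # Per-request state shape(s), e.g. (num_heads, head_dim).
        return ((8, 64),)

    def get_attn_backend(self):
        # Would return the backend whose metadata class is shared by all layers
        # of this kind (cf. LinearAttentionMetadata / ShortConvAttentionMetadata).
        raise NotImplementedError

    def forward(self, hidden_states: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
        # Stand-in recurrence: mix the new tokens into the state in place,
        # then read the layer output from the updated state.
        state.mul_(0.9).add_(hidden_states.mean(dim=0))
        return hidden_states + state
```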
|
||||
|
||||
@ -16,7 +16,7 @@ Further update the model as follows:
|
||||
...
|
||||
|
||||
@classmethod
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
|
||||
if modality.startswith("image"):
|
||||
return "<image>"
|
||||
|
||||
@ -45,14 +45,14 @@ Further update the model as follows:
|
||||
...
|
||||
|
||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||
|
||||
assert self.vision_encoder is not None
|
||||
image_features = self.vision_encoder(image_input)
|
||||
return self.multi_modal_projector(image_features)
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
|
||||
self,
|
||||
**kwargs: object,
|
||||
) -> MultiModalEmbeddings | None:
|
||||
# Validate the multimodal input keyword arguments
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
|
||||
For example, if the model supports any number of images but only one video per prompt:
|
||||
|
||||
```python
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
return {"image": None, "video": 1}
|
||||
```
|
||||
|
||||
@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)

@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python
def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor()
return ImageSize(width=image_processor.size["width"],
height=image_processor.size["height"])
return ImageSize(
width=image_processor.size["width"],
height=image_processor.size["height"],
)
```

Fuyu does not expect image placeholders in the inputs to HF processor, so
@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
self._get_dummy_images(
width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides,
)
}
```

@@ -503,7 +507,7 @@ return a schema of the tensors outputted by the HF processor that are related to
```

!!! note
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.

=== "With postprocessing: Fuyu"
@@ -565,7 +569,7 @@ return a schema of the tensors outputted by the HF processor that are related to
```

!!! note
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
for text-only inputs to prevent unnecessary warnings from HF processor.

!!! note
@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY

+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+ info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder)
+ @MULTIMODAL_REGISTRY.register_processor(
+ YourMultiModalProcessor,
+ info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```

@@ -824,8 +828,8 @@ Some HF processors directly insert feature tokens without replacing anything in

Examples:

- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)

### Handling prompt updates unrelated to multi-modal data

@@ -833,9 +837,9 @@ Examples:

Examples:

- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)

### Custom HF processor

@@ -843,6 +847,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can

Examples:

- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)

@@ -8,11 +8,11 @@ This page provides detailed instructions on how to do so.

## Built-in models

To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
This gives you the ability to modify the codebase and test your model.

After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!

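For orientation, an entry in `_VLLM_MODELS` (which is assembled from several per-category dictionaries in the registry file) maps the architecture name from the model's `config.json` to the module and class implementing it. The dictionary name and exact entry below are an illustrative assumption; check the current registry file for the real layout.

```python
# Hypothetical registry entry; verify the dictionary name and tuple format
# against vllm/model_executor/models/registry.py before copying.
_TEXT_GENERATION_MODELS = {
    # "<architecture in config.json>": ("<module under models/>", "<class name>"),
    "YourModelForCausalLM": ("your_model", "YourModelForCausalLM"),
}
```
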
!!! important
@@ -42,7 +42,7 @@ def register():

ModelRegistry.register_model(
"YourModelForCausalLM",
"your_code:YourModelForCausalLM"
"your_code:YourModelForCausalLM",
)
```

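If the `register()` function above lives in a separate package, it is typically exposed to vLLM through an entry point so it runs automatically when vLLM starts. The packaging snippet below is a sketch under the assumption that the `vllm.general_plugins` entry-point group is used; the package and module names are placeholders.

```python
# setup.py of an out-of-tree model plugin (illustrative sketch; the package
# name, module name, and entry-point group should be checked against the
# current vLLM plugin documentation).
from setuptools import setup

setup(
    name="vllm_your_model_plugin",
    packages=["vllm_your_model_plugin"],
    entry_points={
        "vllm.general_plugins": [
            "register_your_model = vllm_your_model_plugin:register",
        ],
    },
)
```
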
@@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.

### Model loading

Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.

!!! important
@@ -26,26 +26,24 @@ Passing these tests provides more confidence that your implementation is correct

### Model correctness

These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).

#### Generative models

For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):

- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
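
A typical logprobs-similarity test is sketched below. The runner fixtures and helper methods are assumptions modelled on the existing test suite, so compare them against `tests/models/utils.py` and the tests' `conftest.py` before copying.

```python
# Hypothetical sketch of a logprobs-similarity test; fixture and helper names
# are assumptions modelled on existing vLLM model tests.
import pytest

from ...utils import check_logprobs_close  # defined in tests/models/utils.py


@pytest.mark.parametrize("model", ["your-org/YourModel-7B"])  # placeholder repo
def test_models(hf_runner, vllm_runner, example_prompts, model: str) -> None:
    max_tokens, num_logprobs = 32, 5

    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # Each vLLM token must fall within HF's top-k logprobs and vice versa.
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
```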

#### Pooling models

For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.

[](){ #mm-processing-tests }
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).

### Multi-modal processing

#### Common tests

Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:

- Text + multi-modal data
- Tokens + multi-modal data
@@ -54,6 +52,6 @@ Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py>

#### Model-specific tests

You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.

For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).

@@ -15,8 +15,9 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).

??? code "supported_languages and supports_transcription_only"

```python
from typing import ClassVar, Mapping, Optional, Literal
from typing import ClassVar, Mapping, Literal
import numpy as np
import torch
from torch import nn
@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model:

??? code "get_speech_to_text_config()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:

??? code "get_generation_prompt()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -81,10 +84,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
audio: np.ndarray,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
language: Optional[str],
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: Optional[str],
to_language: str | None,
) -> PromptType:
# Example with a free-form instruction prompt
task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@@ -107,6 +110,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:

??? code "get_generation_prompt()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -117,10 +121,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
audio: np.ndarray,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
language: Optional[str],
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: Optional[str],
to_language: str | None,
) -> PromptType:
if language is None:
raise ValueError("Language must be specified")
@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper):

??? code "validate_language()"

```python
@classmethod
def validate_language(cls, language: Optional[str]) -> Optional[str]:
def validate_language(cls, language: str | None) -> str | None:
if language is None:
logger.warning(
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
"Defaulting to language='en'. If you wish to transcribe "
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en"
return super().validate_language(language)
```
@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics:

??? code "get_num_audio_tokens()"

```python
class YourASRModel(nn.Module, SupportsTranscription):
...
@@ -175,7 +184,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
audio_duration_s: float,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
) -> Optional[int]:
) -> int | None:
# Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example
```
@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic:

??? code "_preprocess_speech_to_text()"

```python
# vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...):
@@ -238,9 +248,9 @@ No extra registration is required beyond having your model class available via t

## Examples in-tree

- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)

## Test with the API

@@ -268,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
http://localhost:8000/v1/audio/translations
```

Or check out more examples in <gh-file:examples/online_serving>.
Or check out more examples in [examples/online_serving](../../../examples/online_serving).

!!! note
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.

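Beyond raw `curl`, the same endpoints can be exercised with the official OpenAI Python client pointed at the local server; the served model name and audio file below are placeholders.

```python
# Sketch of calling the transcription endpoint with the OpenAI client;
# the model name and audio path are placeholders for whatever you serve.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("sample.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="openai/whisper-large-v3",  # the model you served
        file=audio_file,
        language="en",
    )

print(transcription.text)
```
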
@@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.

#### Offline Inference

Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
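
In short, the offline-inference flow points `VLLM_TORCH_PROFILER_DIR` at an output directory and brackets the generation calls with `start_profile()`/`stop_profile()`. The snippet below is a condensed sketch of that pattern, not a copy of the linked example, so treat the exact model and arguments as illustrative.

```python
# Condensed sketch of torch-profiler usage for offline inference; see the
# linked simple_profiling.py for the maintained version.
import os

# Must be set before the engine is created so the profiler is enabled.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")

llm.start_profile()
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
llm.stop_profile()

# The trace written under ./vllm_profile can be opened in https://ui.perfetto.dev/
```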

#### OpenAI Server


@@ -1,7 +1,5 @@
# Using Docker

[](){ #deployment-docker-pre-built-image }

## Use vLLM's Official Docker Image

vLLM offers an official Docker image for deployment.
@@ -10,7 +8,7 @@ The image can be used to run OpenAI compatible server and is available on Docker
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
@@ -22,7 +20,7 @@ This image can also be used with other container engines such as [Podman](https:
```bash
podman run --device nvidia.com/gpu=all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
docker.io/vllm/vllm-openai:latest \
@@ -37,7 +35,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
memory to share data between processes under the hood, particularly for tensor parallel inference.

!!! note
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).

If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
@@ -62,11 +60,9 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
```

[](){ #deployment-docker-build-image-from-source }

## Building vLLM's Docker Image from Source

You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:

```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
@@ -128,7 +124,7 @@ To run vLLM with the custom-built Docker image:
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
--env "HF_TOKEN=<secret>" \
vllm/vllm-openai <args...>
```

@@ -1,11 +1,9 @@
# Anyscale

[](){ #deployment-anyscale }

[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.

Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).

When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference

??? console "Command"

```python
```bash
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \
@@ -81,7 +81,7 @@ You should get a response like:

??? console "Response"

```python
```json
{
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": {

@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:

client = OpenAI(
base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
)

completion = client.chat.completions.create(
@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.",
}
]
],
)

print(completion.choices[0].message.content)

@@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs = {"max_tokens": 512}
generation_kwargs={"max_tokens": 512},
)

response = generator.run(

@@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os

client = OpenAI(
base_url = DEPLOYMENT_URL,
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
base_url=DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
)

chat_completion = client.chat.completions.create(
model = "HuggingFaceTB/SmolLM3-3B",
messages = [
model="HuggingFaceTB/SmolLM3-3B",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Give me a brief explanation of gravity in simple terms."
"text": "Give me a brief explanation of gravity in simple terms.",
}
]
],
}
],
stream = True
stream=True,
)

for message in chat_completion:
print(message.choices[0].delta.content, end = "")
print(message.choices[0].delta.content, end="")
```

!!! note
@@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os

client = OpenAI(
base_url = DEPLOYMENT_URL,
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
base_url=DEPLOYMENT_URL,
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
)

chat_completion = client.chat.completions.create(
model = "ibm-granite/granite-docling-258M",
messages = [
model="ibm-granite/granite-docling-258M",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
}
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
},
},
{
"type": "text",
"text": "Convert this page to docling."
}
"text": "Convert this page to docling.",
},
]
}
],
stream = True
stream=True,
)

for message in chat_completion:
print(message.choices[0].delta.content, end = "")
print(message.choices[0].delta.content, end="")
```

!!! note

@@ -36,15 +36,16 @@ pip install vllm litellm
```python
import litellm

messages = [{ "content": "Hello, how are you?","role": "user"}]
messages = [{"content": "Hello, how are you?", "role": "user"}]

# hosted_vllm is prefix key word and necessary
response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2,
max_tokens=80)
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2,
max_tokens=80,
)

print(response)
```

@@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`
- name: vllm-leader
image: docker.io/vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
value: <your-hf-token>
command:
- sh
@@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm

@@ -36,11 +36,11 @@ pip install -U vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)

1. Run the script

```python
```bash
python retrieval_augmented_generation_with_langchain.py
```

@@ -74,10 +74,10 @@ pip install vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```

1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)

1. Run the script:

```python
```bash
python retrieval_augmented_generation_with_llamaindex.py
```

@@ -20,7 +20,7 @@ pip install vllm streamlit openai
vllm serve Qwen/Qwen1.5-0.5B-Chat
```

1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)

1. Start the streamlit web UI and start to chat:

@@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
@@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
@@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret

Some files were not shown because too many files have changed in this diff.