Compare commits
172 Commits
skip-lmfe-
...
woosuk/tes
| Author | SHA1 | Date | |
|---|---|---|---|
| cb439737db | |||
| a1cac48477 | |||
| 6102536d65 | |||
| f65da69c72 | |||
| a5281395e9 | |||
| eda71c2847 | |||
| 1bff9a59ec | |||
| 69c9a01538 | |||
| 8935ca208d | |||
| 938c43ea7f | |||
| 0a9ef0cfce | |||
| e5b438a247 | |||
| 0b99f5d302 | |||
| 1f491aa0c8 | |||
| de92d916fe | |||
| a1063628a4 | |||
| d796375258 | |||
| 14f8456344 | |||
| 4794c2bd92 | |||
| d3cbaa08dc | |||
| 828523ad8e | |||
| 136a17fe6e | |||
| f57438338d | |||
| 5d598680e3 | |||
| 8f4b313c37 | |||
| f93e348010 | |||
| f54f85129e | |||
| d4d1a6024f | |||
| db1764e4e0 | |||
| 7f83b4ee8e | |||
| 5c3bae1a6a | |||
| 5210dc3940 | |||
| 650b51f9f9 | |||
| 6256697997 | |||
| 71557a5f7c | |||
| f3c378ffa7 | |||
| f5ed68ef63 | |||
| efdef57b1f | |||
| b8a4572157 | |||
| 302ef403a2 | |||
| 8865da157b | |||
| f0862eae43 | |||
| 8c851f6d04 | |||
| 7cfa420f49 | |||
| a27b288e4a | |||
| e471d7ca7e | |||
| c43ca8259e | |||
| 85a65e7f51 | |||
| a2986b3e33 | |||
| 96b9aa5aa0 | |||
| e66d787bce | |||
| bfad142e25 | |||
| 9354660036 | |||
| 07ca70af8d | |||
| 2dcd12d357 | |||
| 579d2e5458 | |||
| 0512c04aee | |||
| 7e0ef4084a | |||
| 4aed506b65 | |||
| a86b4c58e8 | |||
| dddad8a81c | |||
| 7f783b8a4a | |||
| ff4810ba73 | |||
| 9d6964926e | |||
| 0e65818910 | |||
| 380f17527c | |||
| b92ab3deda | |||
| acaa2c0a4a | |||
| 82af928c41 | |||
| 87efc681db | |||
| c3a722fcb2 | |||
| aba48f7db1 | |||
| 04b5f9802d | |||
| efc8f7d814 | |||
| 6d87a2838c | |||
| e6cdbd6792 | |||
| df850c4912 | |||
| 720394de43 | |||
| 88a49745af | |||
| ca683a2a72 | |||
| e9f1b8c9e9 | |||
| ea97940d6c | |||
| fdd32750f0 | |||
| c715ba3735 | |||
| 9c4cb68339 | |||
| 780eb03d9b | |||
| ef9676a1f1 | |||
| 70b1b330e1 | |||
| d1d063a588 | |||
| 7e6edb1469 | |||
| 74704d4553 | |||
| d2f816d6ff | |||
| 577d498212 | |||
| fd85c9f426 | |||
| d32c611f45 | |||
| 01ad27faff | |||
| 481545b397 | |||
| d3cc8427c0 | |||
| 4821ac1b4d | |||
| 4497c8f821 | |||
| 2e36cdbe2b | |||
| fe3edb4cf0 | |||
| 29350922c6 | |||
| 8ae169286f | |||
| 8a0af6a561 | |||
| cfded80793 | |||
| b59dd19b55 | |||
| 3e051bda82 | |||
| 8317f72354 | |||
| d8bebb008a | |||
| 35bc22f23c | |||
| fa96fb9c70 | |||
| e3fdb627d9 | |||
| 7200a21cd1 | |||
| 577c72a227 | |||
| 314285d4f2 | |||
| d2a7938582 | |||
| 89342ce4c0 | |||
| f89f599395 | |||
| e251e457c5 | |||
| afc47e4de7 | |||
| e3b90c1ba2 | |||
| 134f70b3ed | |||
| a1b2d658ee | |||
| 5c7fe25491 | |||
| 53c9a7cee2 | |||
| 0d21b9b51e | |||
| 10214b6935 | |||
| 4a61950f4d | |||
| 3263799056 | |||
| 8e67b2557a | |||
| 4073c82c4e | |||
| 767c3ab869 | |||
| 4f207c7174 | |||
| 782505ed8e | |||
| 98f30b8cba | |||
| 3cd36660f7 | |||
| 46ad73955a | |||
| 41f3884438 | |||
| 60e419c1ee | |||
| 7ef6052804 | |||
| 4fca1a1bd2 | |||
| a6049be73c | |||
| 18ed7746ea | |||
| 8fcaaf6a16 | |||
| 9bb38130cb | |||
| b91d8db873 | |||
| 045b396d09 | |||
| 76852017ea | |||
| 82e64c7a20 | |||
| 4ca204055e | |||
| c5c8f5ea59 | |||
| 01653a917b | |||
| 0cd103e7cb | |||
| 5be7ca1b99 | |||
| f0a30a067b | |||
| 9d6cff3ede | |||
| a25f2adee9 | |||
| d0bed837ac | |||
| f7ee69868a | |||
| d2a71530c1 | |||
| 086609de64 | |||
| 727144bed1 | |||
| 55392bc879 | |||
| ddaff2938e | |||
| 27ed39a347 | |||
| 8f8474fbe3 | |||
| be067861c6 | |||
| 5bc26c438d | |||
| eef921f45e | |||
| e317414ce1 | |||
| 949cb0170d |
@ -5,11 +5,11 @@ import os
|
||||
import sys
|
||||
import zipfile
|
||||
|
||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
|
||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
|
||||
# Note that we have 800 MiB quota, please use it wisely.
|
||||
# See https://github.com/pypi/support/issues/6326 .
|
||||
# Please also sync the value with the one in Dockerfile.
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
|
||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
|
||||
|
||||
|
||||
def print_top_10_largest_files(zip_file):
|
||||
|
||||
12
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
Normal file
12
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
Normal file
@ -0,0 +1,12 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.419
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.416
|
||||
limit: 1000
|
||||
num_fewshot: 5
|
||||
@ -0,0 +1,11 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
|
||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "chartqa"
|
||||
metrics:
|
||||
- name: "relaxed_accuracy,none"
|
||||
value: 0.90
|
||||
limit: 100
|
||||
num_fewshot: 0
|
||||
@ -0,0 +1,11 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
|
||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "mmlu_pro"
|
||||
metrics:
|
||||
- name: "exact_match,custom-extract"
|
||||
value: 0.80
|
||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
||||
num_fewshot: 5
|
||||
@ -1,4 +1,5 @@
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
||||
# For vllm script, with -t option (tensor parallel size)
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
|
||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
|
||||
@ -0,0 +1,12 @@
|
||||
# For vllm script, with -t option (tensor parallel size).
|
||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
|
||||
|
||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
||||
backend: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "chartqa"
|
||||
metrics:
|
||||
- name: "relaxed_accuracy,none"
|
||||
value: 0.855
|
||||
limit: 2500
|
||||
num_fewshot: 0
|
||||
1
.buildkite/lm-eval-harness/configs/models-large-h100.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-large-h100.txt
Normal file
@ -0,0 +1 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
||||
@ -0,0 +1 @@
|
||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
|
||||
1
.buildkite/lm-eval-harness/configs/models-mm-small.txt
Normal file
1
.buildkite/lm-eval-harness/configs/models-mm-small.txt
Normal file
@ -0,0 +1 @@
|
||||
Qwen2.5-VL-7B-Instruct.yaml
|
||||
44
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
Executable file
44
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
Executable file
@ -0,0 +1,44 @@
|
||||
#!/bin/bash
|
||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install lm-eval==0.4.9
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on ChartQA using multimodal vllm."
|
||||
echo "This pathway is intended to be used to create baselines for "
|
||||
echo "our correctness tests in vllm's CI."
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -m - huggingface stub or local directory of the model"
|
||||
echo " -l - limit number of samples to run"
|
||||
echo " -t - tensor parallel size to run at"
|
||||
echo
|
||||
}
|
||||
|
||||
while getopts "m:l:t:" OPT; do
|
||||
case ${OPT} in
|
||||
m )
|
||||
MODEL="$OPTARG"
|
||||
;;
|
||||
l )
|
||||
LIMIT="$OPTARG"
|
||||
;;
|
||||
t )
|
||||
TP_SIZE="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
lm_eval --model vllm-vlm \
|
||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
|
||||
--tasks chartqa \
|
||||
--batch_size auto \
|
||||
--apply_chat_template \
|
||||
--limit $LIMIT
|
||||
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file → Executable file
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Normal file → Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash
|
||||
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
|
||||
# We use this for fp8, which HF does not support.
|
||||
#
|
||||
# Make sure you have lm-eval-harness installed:
|
||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
||||
|
||||
usage() {
|
||||
echo``
|
||||
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
|
||||
echo "This pathway is intended to be used to create baselines for "
|
||||
echo "our automated nm-test-accuracy workflow"
|
||||
echo
|
||||
echo "usage: ${0} <options>"
|
||||
echo
|
||||
echo " -m - huggingface stub or local directory of the model"
|
||||
echo " -l - limit number of samples to run"
|
||||
echo " -f - number of fewshot samples to use"
|
||||
echo " -t - tensor parallel size to run at"
|
||||
echo
|
||||
}
|
||||
|
||||
while getopts "m:b:l:f:t:" OPT; do
|
||||
case ${OPT} in
|
||||
m )
|
||||
MODEL="$OPTARG"
|
||||
;;
|
||||
b )
|
||||
BATCH_SIZE="$OPTARG"
|
||||
;;
|
||||
l )
|
||||
LIMIT="$OPTARG"
|
||||
;;
|
||||
f )
|
||||
FEWSHOT="$OPTARG"
|
||||
;;
|
||||
t )
|
||||
TP_SIZE="$OPTARG"
|
||||
;;
|
||||
\? )
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
lm_eval --model vllm \
|
||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
|
||||
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
||||
--batch_size auto
|
||||
@ -19,21 +19,27 @@ RTOL = 0.08
|
||||
def launch_lm_eval(eval_config, tp_size):
|
||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||
max_model_len = eval_config.get("max_model_len", 4096)
|
||||
batch_size = eval_config.get("batch_size", "auto")
|
||||
backend = eval_config.get("backend", "vllm")
|
||||
model_args = (
|
||||
f"pretrained={eval_config['model_name']},"
|
||||
f"tensor_parallel_size={tp_size},"
|
||||
f"enforce_eager=true,"
|
||||
f"add_bos_token=true,"
|
||||
f"trust_remote_code={trust_remote_code},"
|
||||
f"max_model_len={max_model_len}"
|
||||
f"max_model_len={max_model_len},"
|
||||
)
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model=backend,
|
||||
model_args=model_args,
|
||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||
num_fewshot=eval_config["num_fewshot"],
|
||||
limit=eval_config["limit"],
|
||||
batch_size="auto",
|
||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
|
||||
# text models. however, this is regressing measured strict-match for
|
||||
# existing text models in CI, so only apply it for mm.
|
||||
apply_chat_template=backend == "vllm-vlm",
|
||||
batch_size=batch_size,
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ steps:
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
@ -76,7 +76,7 @@ steps:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||
|
||||
# Add job to create multi-arch manifest
|
||||
|
||||
@ -25,25 +25,28 @@ function cpu_tests() {
|
||||
|
||||
# offline inference
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
set -xve
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
set -e
|
||||
set -evx
|
||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||
pip install sentence-transformers datamodel_code_generator
|
||||
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
|
||||
# Note: disable Bart until it supports V1
|
||||
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||
# TODO: The test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] below fails on ppc64le. Disabling it for the time being.
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
||||
}
|
||||
|
||||
# All of the CPU tests are expected to finish in less than 40 mins.
|
||||
|
||||
export container_id
|
||||
export -f cpu_tests
|
||||
timeout 40m bash -c cpu_tests
|
||||
timeout 120m bash -c cpu_tests
|
||||
|
||||
|
||||
@ -44,6 +44,5 @@ docker run \
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||
pytest -v -s v1/test_metrics
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
'
|
||||
|
||||
1267
.buildkite/test-amd.yaml
Normal file
1267
.buildkite/test-amd.yaml
Normal file
File diff suppressed because it is too large
Load Diff
@ -403,6 +403,7 @@ steps:
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
- pytest -v -s compile/test_noop_elimination.py
|
||||
- pytest -v -s compile/test_aot_compile.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
@ -526,7 +527,8 @@ steps:
|
||||
# since torchao nightly is only compatible with torch nightly currently
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@ -732,6 +734,16 @@ steps:
|
||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||
timeout_in_minutes: 70
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
source_file_dependencies:
|
||||
- vllm/multimodal/
|
||||
- vllm/inputs/
|
||||
- vllm/v1/core/
|
||||
commands:
|
||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
|
||||
17
.coveragerc
17
.coveragerc
@ -1,5 +1,10 @@
|
||||
[run]
|
||||
source = vllm
|
||||
# Track the installed vllm package (this is what actually gets imported during tests)
|
||||
# Use wildcard pattern to match the installed location
|
||||
source =
|
||||
vllm
|
||||
*/dist-packages/vllm
|
||||
*/site-packages/vllm
|
||||
omit =
|
||||
*/tests/*
|
||||
*/test_*
|
||||
@ -12,6 +17,16 @@ omit =
|
||||
*/benchmarks/*
|
||||
*/docs/*
|
||||
|
||||
[paths]
|
||||
# Map all possible vllm locations to a canonical "vllm" path
|
||||
# This ensures coverage.combine properly merges data from different test runs
|
||||
source =
|
||||
vllm
|
||||
/vllm-workspace/src/vllm
|
||||
/vllm-workspace/vllm
|
||||
*/site-packages/vllm
|
||||
*/dist-packages/vllm
|
||||
|
||||
[report]
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
|
||||
4
.git-blame-ignore-revs
Normal file
4
.git-blame-ignore-revs
Normal file
@ -0,0 +1,4 @@
|
||||
# Migrate from `yapf` & `isort` to `ruff`
|
||||
d6953beb91da4e9c99be4c0a1304a2d24189535c
|
||||
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
|
||||
8fcaaf6a165e661f63fc51be906bc05b0767332f
|
||||
11
.github/CODEOWNERS
vendored
11
.github/CODEOWNERS
vendored
@ -5,9 +5,7 @@
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/model_executor/layers/fused_moe @mgoin
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
@ -121,3 +118,11 @@ mkdocs.yaml @hmellor
|
||||
|
||||
# KVConnector installation files
|
||||
/requirements/kv_connectors.txt @NickLucche
|
||||
|
||||
# Pooling models
|
||||
/examples/*/pooling/ @noooop
|
||||
/tests/models/*/pooling* @noooop
|
||||
/tests/entrypoints/pooling @noooop
|
||||
/vllm/config/pooler.py @noooop
|
||||
/vllm/pooling_params.py @noooop
|
||||
/vllm/model_executor/layers/pooler.py @noooop
|
||||
|
||||
138
.github/workflows/issue_autolabel.yml
vendored
138
.github/workflows/issue_autolabel.yml
vendored
@ -13,6 +13,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Label issues based on keywords
|
||||
id: label-step
|
||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||
with:
|
||||
script: |
|
||||
@ -42,7 +43,6 @@ jobs:
|
||||
searchIn: "body"
|
||||
},
|
||||
],
|
||||
|
||||
// Substring search - matches anywhere in text (partial matches)
|
||||
substrings: [
|
||||
{
|
||||
@ -89,14 +89,12 @@ jobs:
|
||||
term: "hip_",
|
||||
searchIn: "both"
|
||||
},
|
||||
|
||||
// ROCm tools and libraries
|
||||
{
|
||||
term: "hipify",
|
||||
searchIn: "both"
|
||||
},
|
||||
],
|
||||
|
||||
// Regex patterns - for complex pattern matching
|
||||
regexPatterns: [
|
||||
{
|
||||
@ -107,13 +105,17 @@ jobs:
|
||||
}
|
||||
],
|
||||
},
|
||||
// Add more label configurations here as needed
|
||||
// example: {
|
||||
// keywords: [...],
|
||||
// substrings: [...],
|
||||
// regexPatterns: [...]
|
||||
// },
|
||||
};
|
||||
|
||||
// Helper function to create regex based on search type
|
||||
function createSearchRegex(term, type) {
|
||||
// Escape special regex characters in the term
|
||||
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||
|
||||
switch (type) {
|
||||
case 'keyword':
|
||||
// Word boundary search - matches whole words only
|
||||
@ -125,16 +127,13 @@ jobs:
|
||||
throw new Error(`Unknown search type: ${type}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to find matching terms in text with line information
|
||||
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
|
||||
const matches = [];
|
||||
const lines = text.split('\n');
|
||||
|
||||
for (const termConfig of searchTerms) {
|
||||
let regex;
|
||||
let term, searchIn, pattern, description, flags;
|
||||
|
||||
// Handle different input formats (string or object)
|
||||
if (typeof termConfig === 'string') {
|
||||
term = termConfig;
|
||||
@ -146,21 +145,17 @@ jobs:
|
||||
description = termConfig.description;
|
||||
flags = termConfig.flags;
|
||||
}
|
||||
|
||||
// Skip if this term shouldn't be searched in the current location
|
||||
if (searchIn !== 'both' && searchIn !== searchLocation) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create appropriate regex
|
||||
if (searchType === 'regex') {
|
||||
regex = new RegExp(pattern, flags || "gi");
|
||||
} else {
|
||||
regex = createSearchRegex(term, searchType);
|
||||
}
|
||||
|
||||
const termMatches = [];
|
||||
|
||||
// Check each line for matches
|
||||
lines.forEach((line, lineIndex) => {
|
||||
const lineMatches = line.match(regex);
|
||||
@ -175,15 +170,14 @@ jobs:
|
||||
originalTerm: term || pattern,
|
||||
description: description,
|
||||
// Show context around the match in the line
|
||||
context: line.length > 100 ?
|
||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||
context: line.length > 100 ?
|
||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||
: line.trim()
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
if (termMatches.length > 0) {
|
||||
matches.push({
|
||||
term: term || (description || pattern),
|
||||
@ -196,64 +190,48 @@ jobs:
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return matches;
|
||||
}
|
||||
|
||||
// Helper function to check if label should be added
|
||||
async function processLabel(labelName, config) {
|
||||
const body = context.payload.issue.body || "";
|
||||
const title = context.payload.issue.title || "";
|
||||
|
||||
core.notice(`Processing label: ${labelName}`);
|
||||
core.notice(`Issue Title: "${title}"`);
|
||||
core.notice(`Issue Body length: ${body.length} characters`);
|
||||
|
||||
let shouldAddLabel = false;
|
||||
let allMatches = [];
|
||||
let reason = '';
|
||||
|
||||
const keywords = config.keywords || [];
|
||||
const substrings = config.substrings || [];
|
||||
const regexPatterns = config.regexPatterns || [];
|
||||
|
||||
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
|
||||
|
||||
// Search in title
|
||||
if (title.trim()) {
|
||||
core.notice(`Searching in title: "${title}"`);
|
||||
|
||||
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
|
||||
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
|
||||
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
|
||||
|
||||
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
|
||||
}
|
||||
|
||||
// Search in body
|
||||
if (body.trim()) {
|
||||
core.notice(`Searching in body (${body.length} characters)`);
|
||||
|
||||
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
|
||||
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
|
||||
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
|
||||
|
||||
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
|
||||
}
|
||||
|
||||
if (allMatches.length > 0) {
|
||||
core.notice(`Found ${allMatches.length} matching term(s):`);
|
||||
|
||||
for (const termMatch of allMatches) {
|
||||
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
|
||||
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
|
||||
|
||||
if (termMatch.searchType === 'regex') {
|
||||
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||
} else {
|
||||
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||
}
|
||||
|
||||
// Show details for each match
|
||||
termMatch.matches.forEach((match, index) => {
|
||||
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
|
||||
@ -266,7 +244,6 @@ jobs:
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
shouldAddLabel = true;
|
||||
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
|
||||
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
|
||||
@ -274,13 +251,10 @@ jobs:
|
||||
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
|
||||
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
|
||||
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
|
||||
|
||||
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
|
||||
}
|
||||
|
||||
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
|
||||
core.notice(`Reason: ${reason || 'No matching terms found'}`);
|
||||
|
||||
if (shouldAddLabel) {
|
||||
const existingLabels = context.payload.issue.labels.map(l => l.name);
|
||||
if (!existingLabels.includes(labelName)) {
|
||||
@ -296,14 +270,92 @@ jobs:
|
||||
core.notice(`Label "${labelName}" already present.`);
|
||||
return false;
|
||||
}
|
||||
|
||||
core.notice(`No matching terms found for label "${labelName}".`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Process all configured labels
|
||||
const processLabels = Object.entries(labelConfig)
|
||||
.map(([labelName, config]) => processLabel(labelName, config));
|
||||
const labelsAdded = await Promise.all(processLabels);
|
||||
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
|
||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||
const labelsAddedResults = await Promise.all(
|
||||
Object.entries(labelConfig).map(([labelName, config]) =>
|
||||
processLabel(labelName, config).then(added => ({ labelName, added }))
|
||||
)
|
||||
);
|
||||
|
||||
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
|
||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||
|
||||
// Return which labels were added for the next step
|
||||
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
|
||||
core.setOutput('labels_added', JSON.stringify(addedLabels));
|
||||
return addedLabels;
|
||||
|
||||
- name: CC users for labeled issues
  if: steps.label-step.outputs.labels_added != '[]'
  uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
  env:
    # Hand the previous step's output to the script through the environment
    # rather than interpolating it into the script source, so the JSON text
    # can never terminate the string literal or be parsed as JavaScript.
    LABELS_ADDED: ${{ steps.label-step.outputs.labels_added }}
  with:
    script: |
      // For every label added by the labeling step, post one comment that
      // @-mentions the users configured for that label, skipping users who
      // are already mentioned in the issue body or in an existing comment.

      // Configuration: Map labels to GitHub users to CC
      // You can add multiple users per label, and multiple label configurations
      const ccConfig = {
        rocm: {
          users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
          message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
        },
        // Add more label -> user mappings here
        // Example:
        // cuda: {
        //   users: ['user1', 'user2'],
        //   message: 'CC {users} for CUDA-related issue'
        // },
        // performance: {
        //   users: ['perfexpert'],
        //   message: 'CC {users} for performance issue'
        // },
      };

      const labelsAdded = JSON.parse(process.env.LABELS_ADDED || '[]');
      core.notice(`Labels added: ${labelsAdded.join(', ')}`);

      // Get existing comments to check for already mentioned users.
      // Paginate: a single listComments call only returns the first page,
      // so mentions further down a long thread would otherwise be missed.
      const comments = await github.paginate(github.rest.issues.listComments, {
        owner: context.repo.owner,
        repo: context.repo.repo,
        issue_number: context.issue.number,
        per_page: 100,
      });

      const issueBody = context.payload.issue.body || '';
      // Mutable on purpose: comments posted below are appended so that a user
      // configured under several labels is only pinged once per run.
      let allExistingText = [issueBody, ...comments.map(c => c.body || '')].join('\n');

      // Process each label that was added
      for (const label of labelsAdded) {
        const config = ccConfig[label];
        if (!config) {
          // No CC mapping configured for this label.
          continue;
        }

        // Check which users haven't been mentioned yet
        const usersToMention = [];
        for (const user of config.users) {
          const mentionPattern = new RegExp(`@${user}\\b`, 'i');
          if (!mentionPattern.test(allExistingText)) {
            usersToMention.push(user);
          } else {
            core.notice(`@${user} already mentioned for label "${label}", skipping`);
          }
        }

        // Post comment if there are users to mention
        if (usersToMention.length === 0) {
          core.notice(`All users for label "${label}" already mentioned, skipping comment`);
          continue;
        }

        const mentions = usersToMention.map(u => `@${u}`).join(' ');
        const message = config.message.replace('{users}', mentions);

        await github.rest.issues.createComment({
          owner: context.repo.owner,
          repo: context.repo.repo,
          issue_number: context.issue.number,
          body: message
        });

        // Remember what was just posted for the remaining labels.
        allExistingText += '\n' + message;

        core.notice(`CC comment added for label "${label}": ${mentions}`);
      }
|
||||
@ -16,6 +16,7 @@ repos:
|
||||
rev: v1.38.1
|
||||
hooks:
|
||||
- id: typos
|
||||
args: [--force-exclude]
|
||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||
rev: v21.1.2
|
||||
hooks:
|
||||
|
||||
@ -8,7 +8,6 @@ import sys
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional, Union
|
||||
|
||||
import aiohttp
|
||||
import huggingface_hub.constants
|
||||
@ -28,13 +27,13 @@ class RequestFuncInput:
|
||||
prompt_len: int
|
||||
output_len: int
|
||||
model: str
|
||||
model_name: Optional[str] = None
|
||||
logprobs: Optional[int] = None
|
||||
extra_body: Optional[dict] = None
|
||||
multi_modal_content: Optional[dict | list[dict]] = None
|
||||
model_name: str | None = None
|
||||
logprobs: int | None = None
|
||||
extra_body: dict | None = None
|
||||
multi_modal_content: dict | list[dict] | None = None
|
||||
ignore_eos: bool = False
|
||||
language: Optional[str] = None
|
||||
request_id: Optional[str] = None
|
||||
language: str | None = None
|
||||
request_id: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -52,7 +51,7 @@ class RequestFuncOutput:
|
||||
|
||||
async def async_request_tgi(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
@ -133,7 +132,7 @@ async def async_request_tgi(
|
||||
|
||||
async def async_request_trt_llm(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate_stream")
|
||||
@ -204,7 +203,7 @@ async def async_request_trt_llm(
|
||||
|
||||
async def async_request_deepspeed_mii(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("completions", "profile")), (
|
||||
@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
|
||||
|
||||
async def async_request_openai_completions(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("completions", "profile")), (
|
||||
@ -367,7 +366,7 @@ async def async_request_openai_completions(
|
||||
|
||||
async def async_request_openai_chat_completions(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("chat/completions", "profile")), (
|
||||
@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
|
||||
|
||||
async def async_request_openai_audio(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
pbar: tqdm | None = None,
|
||||
) -> RequestFuncOutput:
|
||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||
import soundfile
|
||||
@ -610,7 +609,7 @@ def get_tokenizer(
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = False,
|
||||
**kwargs,
|
||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
|
||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||
pretrained_model_name_or_path
|
||||
):
|
||||
|
||||
@ -32,7 +32,6 @@ import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
@ -80,7 +79,7 @@ def sample_requests_from_dataset(
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
input_length_range: tuple[int, int],
|
||||
fixed_output_len: Optional[int],
|
||||
fixed_output_len: int | None,
|
||||
) -> list[Request]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
@ -128,7 +127,7 @@ def sample_requests_from_random(
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
input_length_range: tuple[int, int],
|
||||
fixed_output_len: Optional[int],
|
||||
fixed_output_len: int | None,
|
||||
prefix_len: int,
|
||||
) -> list[Request]:
|
||||
requests = []
|
||||
|
||||
@ -7,7 +7,6 @@ import dataclasses
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
@ -24,7 +23,7 @@ def sample_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
fixed_output_len: Optional[int],
|
||||
fixed_output_len: int | None,
|
||||
) -> list[tuple[str, int, int, int]]:
|
||||
if fixed_output_len is not None and fixed_output_len < 4:
|
||||
raise ValueError("output_len too small")
|
||||
|
||||
@ -32,7 +32,6 @@ import uuid
|
||||
import warnings
|
||||
from collections.abc import AsyncGenerator
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
@ -316,7 +315,7 @@ def calculate_metrics(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
selected_percentile_metrics: list[str],
|
||||
selected_percentiles: list[float],
|
||||
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||
goodput_config_dict: dict[str, float] | None = None,
|
||||
) -> tuple[BenchmarkMetrics, list[int]]:
|
||||
actual_output_lens: list[int] = []
|
||||
total_input = 0
|
||||
@ -436,9 +435,9 @@ async def benchmark(
|
||||
selected_percentile_metrics: list[str],
|
||||
selected_percentiles: list[str],
|
||||
ignore_eos: bool,
|
||||
max_concurrency: Optional[int],
|
||||
max_concurrency: int | None,
|
||||
structured_output_ratio: float,
|
||||
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||
goodput_config_dict: dict[str, float] | None = None,
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
|
||||
@ -6,7 +6,7 @@ import math
|
||||
import os
|
||||
import time
|
||||
from types import TracebackType
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Any
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
@ -92,7 +92,7 @@ class TimeCollector:
|
||||
def __init__(self, scale: int) -> None:
|
||||
self.cnt: int = 0
|
||||
self._sum: int = 0
|
||||
self._max: Optional[int] = None
|
||||
self._max: int | None = None
|
||||
self.scale = scale
|
||||
self.start_time: int = time.monotonic_ns()
|
||||
|
||||
@ -104,13 +104,13 @@ class TimeCollector:
|
||||
else:
|
||||
self._max = max(self._max, v)
|
||||
|
||||
def avg(self) -> Union[float, str]:
|
||||
def avg(self) -> float | str:
|
||||
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
||||
|
||||
def max(self) -> Union[float, str]:
|
||||
def max(self) -> float | str:
|
||||
return self._max / self.scale if self._max else "N/A"
|
||||
|
||||
def dump_avg_max(self) -> list[Union[float, str]]:
|
||||
def dump_avg_max(self) -> list[float | str]:
|
||||
return [self.avg(), self.max()]
|
||||
|
||||
def __enter__(self) -> None:
|
||||
@ -118,8 +118,8 @@ class TimeCollector:
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: Optional[type[BaseException]],
|
||||
exc_value: Optional[BaseException],
|
||||
exc_traceback: Optional[TracebackType],
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_value: BaseException | None,
|
||||
exc_traceback: TracebackType | None,
|
||||
) -> None:
|
||||
self.collect(time.monotonic_ns() - self.start_time)
|
||||
|
||||
@ -6,8 +6,7 @@ import copy
|
||||
import itertools
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from typing import Callable
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
|
||||
@ -6,8 +6,7 @@ import copy
|
||||
import itertools
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from typing import Callable, Optional
|
||||
from collections.abc import Callable, Iterable
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -53,7 +52,7 @@ def bench_int8(
|
||||
n: int,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
"""Benchmark INT8-based kernels."""
|
||||
assert dtype == torch.int8
|
||||
@ -108,7 +107,7 @@ def bench_fp8(
|
||||
n: int,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
"""Benchmark FP8-based kernels."""
|
||||
assert dtype == torch.float8_e4m3fn
|
||||
@ -183,7 +182,7 @@ def bench(
|
||||
n: int,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
if dtype == torch.int8:
|
||||
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
|
||||
@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
|
||||
def run(
|
||||
dtype: torch.dtype,
|
||||
MKNs: Iterable[tuple[int, int, int]],
|
||||
bench_kernels: Optional[list[str]] = None,
|
||||
bench_kernels: list[str] | None = None,
|
||||
) -> Iterable[TMeasurement]:
|
||||
results = []
|
||||
for m, k, n in MKNs:
|
||||
|
||||
@ -3,10 +3,9 @@
|
||||
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Callable, Iterable
|
||||
from dataclasses import dataclass
|
||||
from itertools import product
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
|
||||
def unfused_int8_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
):
|
||||
# Norm
|
||||
@ -68,7 +67,7 @@ def unfused_int8_impl(
|
||||
def unfused_fp8_impl(
|
||||
rms_norm_layer: RMSNorm,
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
):
|
||||
# Norm
|
||||
@ -85,7 +84,7 @@ def unfused_fp8_impl(
|
||||
def fused_impl(
|
||||
rms_norm_layer: RMSNorm, # this stores the weights
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
residual: torch.Tensor | None,
|
||||
quant_dtype: torch.dtype,
|
||||
):
|
||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import itertools
|
||||
from typing import Callable
|
||||
from collections.abc import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import pandas as pd
|
||||
|
||||
@ -22,8 +22,8 @@ Example:
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from contextlib import nullcontext
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
|
||||
def benchmark_allreduce_single(
|
||||
self,
|
||||
sequence_length: int,
|
||||
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
|
||||
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
|
||||
should_use_fn: Callable[[torch.Tensor], bool],
|
||||
context,
|
||||
num_warmup: int,
|
||||
num_trials: int,
|
||||
) -> Optional[float]:
|
||||
) -> float | None:
|
||||
"""Benchmark method with CUDA graph optimization."""
|
||||
try:
|
||||
# Create test tensor (2D: sequence_length x hidden_size)
|
||||
|
||||
@ -6,11 +6,12 @@ import copy
|
||||
import json
|
||||
import pickle
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from itertools import product
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -158,7 +159,7 @@ def ref_group_gemm(
|
||||
seq_lens_cpu: torch.Tensor,
|
||||
prompt_lora_mapping_cpu: torch.Tensor,
|
||||
scaling: float,
|
||||
add_inputs: Optional[bool],
|
||||
add_inputs: bool | None,
|
||||
):
|
||||
"""
|
||||
Torch group gemm reference implementation to test correctness of
|
||||
@ -316,8 +317,8 @@ class BenchmarkContext:
|
||||
lora_rank: int
|
||||
sort_by_lora_id: bool
|
||||
dtype: torch.dtype
|
||||
seq_length: Optional[int] = None
|
||||
num_slices: Optional[int] = None # num_slices for slice based ops
|
||||
seq_length: int | None = None
|
||||
num_slices: int | None = None # num_slices for slice based ops
|
||||
|
||||
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
|
||||
ctx = copy.copy(self)
|
||||
@ -561,7 +562,7 @@ class BenchmarkTensors:
|
||||
}
|
||||
|
||||
def bench_fn_kwargs(
|
||||
self, op_type: OpType, add_inputs: Optional[bool] = None
|
||||
self, op_type: OpType, add_inputs: bool | None = None
|
||||
) -> dict[str, Any]:
|
||||
if op_type.is_shrink_fn():
|
||||
assert add_inputs is None
|
||||
@ -575,7 +576,7 @@ class BenchmarkTensors:
|
||||
raise ValueError(f"Unrecognized optype {self}")
|
||||
|
||||
def test_correctness(
|
||||
self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
|
||||
self, op_type: OpType, expand_fn_add_inputs: bool | None
|
||||
) -> bool:
|
||||
"""
|
||||
Test correctness of op_type implementation against a grouped gemm
|
||||
@ -611,8 +612,8 @@ def bench_optype(
|
||||
ctx: BenchmarkContext,
|
||||
arg_pool_size: int,
|
||||
op_type: OpType,
|
||||
cuda_graph_nops: Optional[int] = None,
|
||||
expand_fn_add_inputs: Optional[bool] = None,
|
||||
cuda_graph_nops: int | None = None,
|
||||
expand_fn_add_inputs: bool | None = None,
|
||||
test_correctness: bool = False,
|
||||
) -> TMeasurement:
|
||||
assert arg_pool_size >= 1
|
||||
@ -679,7 +680,7 @@ def bench_torch_mm(
|
||||
ctx: BenchmarkContext,
|
||||
arg_pool_size: int,
|
||||
op_type: OpType,
|
||||
cuda_graph_nops: Optional[int] = None,
|
||||
cuda_graph_nops: int | None = None,
|
||||
) -> TMeasurement:
|
||||
"""
|
||||
Benchmark basic torch.mm as a roofline.
|
||||
@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
|
||||
"""
|
||||
|
||||
|
||||
def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
|
||||
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
|
||||
compare = TBenchmark.Compare(timers)
|
||||
compare.print()
|
||||
|
||||
|
||||
@ -8,10 +8,9 @@ import math
|
||||
import os
|
||||
import pickle as pkl
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from collections.abc import Callable, Iterable
|
||||
from dataclasses import dataclass
|
||||
from itertools import product
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
@ -63,23 +62,23 @@ class BenchmarkTensors:
|
||||
a: torch.Tensor
|
||||
|
||||
w_q: torch.Tensor
|
||||
group_size: Optional[int]
|
||||
group_size: int | None
|
||||
wtype: ScalarType
|
||||
w_g_s: torch.Tensor
|
||||
w_g_zp: Optional[torch.Tensor]
|
||||
w_ch_s: Optional[torch.Tensor]
|
||||
w_tok_s: Optional[torch.Tensor]
|
||||
w_g_zp: torch.Tensor | None
|
||||
w_ch_s: torch.Tensor | None
|
||||
w_tok_s: torch.Tensor | None
|
||||
|
||||
|
||||
@dataclass
|
||||
class TypeConfig:
|
||||
act_type: torch.dtype
|
||||
weight_type: ScalarType
|
||||
output_type: Optional[torch.dtype]
|
||||
group_scale_type: Optional[torch.dtype]
|
||||
group_zero_type: Optional[torch.dtype]
|
||||
channel_scale_type: Optional[torch.dtype]
|
||||
token_scale_type: Optional[torch.dtype]
|
||||
output_type: torch.dtype | None
|
||||
group_scale_type: torch.dtype | None
|
||||
group_zero_type: torch.dtype | None
|
||||
channel_scale_type: torch.dtype | None
|
||||
token_scale_type: torch.dtype | None
|
||||
|
||||
|
||||
def rand_data(shape, dtype=torch.float16, scale=1):
|
||||
@ -93,8 +92,8 @@ def quantize_and_pack(
|
||||
atype: torch.dtype,
|
||||
w: torch.Tensor,
|
||||
wtype: ScalarType,
|
||||
stype: Optional[torch.dtype],
|
||||
group_size: Optional[int],
|
||||
stype: torch.dtype | None,
|
||||
group_size: int | None,
|
||||
zero_points: bool = False,
|
||||
):
|
||||
assert wtype.is_integer(), "TODO: support floating point weights"
|
||||
@ -113,7 +112,7 @@ def quantize_and_pack(
|
||||
|
||||
|
||||
def create_bench_tensors(
|
||||
shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
|
||||
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
|
||||
) -> list[BenchmarkTensors]:
|
||||
m, n, k = shape
|
||||
|
||||
@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
|
||||
return res
|
||||
|
||||
|
||||
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
|
||||
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
|
||||
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
|
||||
|
||||
|
||||
def bench(
|
||||
|
||||
@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
block_quant_shape = get_weight_block_size_safety(config)
|
||||
|
||||
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||
use_customized_permute = args.use_customized_permute
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
@ -37,7 +36,7 @@ def main(
|
||||
seed: int,
|
||||
do_profile: bool,
|
||||
device: str = "cuda",
|
||||
kv_cache_dtype: Optional[str] = None,
|
||||
kv_cache_dtype: str | None = None,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
|
||||
@ -3,8 +3,8 @@
|
||||
|
||||
import argparse
|
||||
import math
|
||||
from collections.abc import Callable
|
||||
from contextlib import contextmanager
|
||||
from typing import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
|
||||
@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
|
||||
residual: torch.Tensor | None = None,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
orig_dtype = x.dtype
|
||||
x = x.to(torch.float32)
|
||||
if residual is not None:
|
||||
@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
|
||||
def rmsnorm_naive(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
|
||||
@ -65,7 +64,7 @@ def rmsnorm_naive(
|
||||
def rmsnorm_flashinfer(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
|
||||
def rmsnorm_vllm(
|
||||
x: torch.Tensor,
|
||||
weight: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
residual: torch.Tensor | None = None,
|
||||
eps: float = 1e-6,
|
||||
):
|
||||
orig_shape = x.shape
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from itertools import accumulate
|
||||
from typing import Optional
|
||||
|
||||
import nvtx
|
||||
import torch
|
||||
@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
rotary_dim: int | None,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
import csv
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import torch
|
||||
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
||||
@torch.no_grad()
|
||||
def benchmark_decode(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_len: int,
|
||||
num_heads: tuple[int, int] = (64, 8),
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
import csv
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import flashinfer
|
||||
import torch
|
||||
@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
||||
@torch.no_grad()
|
||||
def benchmark_prefill(
|
||||
dtype: torch.dtype,
|
||||
quant_dtypes: tuple[
|
||||
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||
],
|
||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
||||
batch_size: int,
|
||||
max_seq_len: int,
|
||||
num_heads: tuple[int, int] = (64, 8),
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Callable, Optional
|
||||
from collections.abc import Callable, Iterable
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as TBenchmark
|
||||
@ -55,7 +55,7 @@ class Bench:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cuda_graph_params: Optional[CudaGraphBenchParams],
|
||||
cuda_graph_params: CudaGraphBenchParams | None,
|
||||
label: str,
|
||||
sub_label: str,
|
||||
description: str,
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC, abstractmethod
|
||||
from statistics import mean
|
||||
from typing import Any, NamedTuple, Optional, Union
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import numpy as np # type: ignore
|
||||
import pandas as pd # type: ignore
|
||||
@ -35,8 +35,8 @@ class Distribution(ABC):
|
||||
class UniformDistribution(Distribution):
|
||||
def __init__(
|
||||
self,
|
||||
min_val: Union[int, float],
|
||||
max_val: Union[int, float],
|
||||
min_val: int | float,
|
||||
max_val: int | float,
|
||||
is_integer: bool = True,
|
||||
) -> None:
|
||||
self.min_val = min_val
|
||||
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
|
||||
|
||||
|
||||
class ConstantDistribution(Distribution):
|
||||
def __init__(self, value: Union[int, float]) -> None:
|
||||
def __init__(self, value: int | float) -> None:
|
||||
self.value = value
|
||||
self.max_val = value
|
||||
|
||||
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
|
||||
|
||||
|
||||
class ZipfDistribution(Distribution):
|
||||
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
||||
self.alpha = alpha
|
||||
self.max_val = max_val
|
||||
|
||||
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
|
||||
|
||||
|
||||
class PoissonDistribution(Distribution):
|
||||
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
||||
self.alpha = alpha
|
||||
self.max_val = max_val
|
||||
|
||||
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
|
||||
class LognormalDistribution(Distribution):
|
||||
def __init__(
|
||||
self,
|
||||
mean: Optional[float] = None,
|
||||
sigma: Optional[float] = None,
|
||||
average: Optional[int] = None,
|
||||
median_ratio: Optional[float] = None,
|
||||
max_val: Optional[int] = None,
|
||||
mean: float | None = None,
|
||||
sigma: float | None = None,
|
||||
average: int | None = None,
|
||||
median_ratio: float | None = None,
|
||||
max_val: int | None = None,
|
||||
) -> None:
|
||||
self.average = average
|
||||
self.median_ratio = median_ratio
|
||||
|
||||
@ -13,7 +13,7 @@ from datetime import datetime
|
||||
from enum import Enum
|
||||
from http import HTTPStatus
|
||||
from statistics import mean
|
||||
from typing import NamedTuple, Union
|
||||
from typing import NamedTuple
|
||||
|
||||
import aiohttp # type: ignore
|
||||
import numpy as np # type: ignore
|
||||
@ -169,7 +169,7 @@ class MovingAverage:
|
||||
class DebugStats:
|
||||
def __init__(self, logger: logging.Logger, window_size: int) -> None:
|
||||
self.logger = logger
|
||||
self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
|
||||
self.metrics: dict[str, MovingAverage | MetricStats] = {
|
||||
"moving_avg_ttft_ms": MovingAverage(window_size),
|
||||
"moving_avg_tpot_ms": MovingAverage(window_size),
|
||||
"ttft_ms": MetricStats(),
|
||||
@ -636,7 +636,7 @@ async def client_main(
|
||||
|
||||
if args.verbose:
|
||||
curr_time_sec: float = time.perf_counter()
|
||||
time_since_last_turn: Union[str, float] = "N/A"
|
||||
time_since_last_turn: str | float = "N/A"
|
||||
if conv_id in time_of_last_turn:
|
||||
time_since_last_turn = round(
|
||||
curr_time_sec - time_of_last_turn[conv_id], 3
|
||||
@ -928,13 +928,13 @@ async def main_mp(
|
||||
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
|
||||
)
|
||||
|
||||
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
|
||||
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
|
||||
if len(client_metrics) < (5 * bench_args.num_clients):
|
||||
# Do not estimate the RPS if the number of samples is very low
|
||||
# (threshold can be tuned if needed)
|
||||
rps = "N/A"
|
||||
|
||||
runtime_left_sec: Union[str, float] = round(
|
||||
runtime_left_sec: str | float = round(
|
||||
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
|
||||
)
|
||||
if percent < 0.05:
|
||||
|
||||
@ -13,7 +13,7 @@ import argparse
|
||||
import json
|
||||
import random
|
||||
from statistics import mean
|
||||
from typing import Any, Optional
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd # type: ignore
|
||||
import tqdm # type: ignore
|
||||
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
|
||||
|
||||
|
||||
def content_is_valid(
|
||||
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
|
||||
content: str, min_content_len: int | None, max_content_len: int | None
|
||||
) -> bool:
|
||||
if min_content_len and len(content) < min_content_len:
|
||||
return False
|
||||
@ -37,7 +37,7 @@ def content_is_valid(
|
||||
|
||||
|
||||
def print_stats(
|
||||
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
|
||||
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
|
||||
) -> None:
|
||||
# Collect statistics
|
||||
stats = []
|
||||
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
|
||||
seed: int,
|
||||
input_file: str,
|
||||
output_file: str,
|
||||
max_items: Optional[int],
|
||||
min_content_len: Optional[int] = None,
|
||||
max_content_len: Optional[int] = None,
|
||||
min_turns: Optional[int] = None,
|
||||
max_turns: Optional[int] = None,
|
||||
model: Optional[str] = None,
|
||||
max_items: int | None,
|
||||
min_content_len: int | None = None,
|
||||
max_content_len: int | None = None,
|
||||
min_turns: int | None = None,
|
||||
max_turns: int | None = None,
|
||||
model: str | None = None,
|
||||
) -> None:
|
||||
if min_turns and max_turns:
|
||||
assert min_turns <= max_turns
|
||||
|
||||
@ -198,13 +198,24 @@ else()
|
||||
endif()
|
||||
|
||||
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG v3.9
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
|
||||
|
||||
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
|
||||
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
|
||||
)
|
||||
else()
|
||||
message(STATUS "Downloading oneDNN from GitHub")
|
||||
FetchContent_Declare(
|
||||
oneDNN
|
||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||
GIT_TAG v3.9
|
||||
GIT_PROGRESS TRUE
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
endif()
|
||||
|
||||
if(USE_ACL)
|
||||
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
|
||||
@ -227,7 +238,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
|
||||
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
|
||||
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
|
||||
set(ONEDNN_VERBOSE "ON")
|
||||
set(ONEDNN_VERBOSE "OFF")
|
||||
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||
|
||||
FetchContent_MakeAvailable(oneDNN)
|
||||
@ -309,4 +320,4 @@ define_gpu_extension_target(
|
||||
WITH_SOABI
|
||||
)
|
||||
|
||||
message(STATUS "Enabling C extension.")
|
||||
message(STATUS "Enabling C extension.")
|
||||
@ -22,10 +22,10 @@ else()
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND ""
|
||||
)
|
||||
FetchContent_Populate(qutlass)
|
||||
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
|
||||
endif()
|
||||
|
||||
FetchContent_Populate(qutlass)
|
||||
|
||||
if(NOT qutlass_SOURCE_DIR)
|
||||
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
|
||||
endif()
|
||||
|
||||
12
codecov.yml
Normal file
12
codecov.yml
Normal file
@ -0,0 +1,12 @@
|
||||
codecov:
|
||||
require_ci_to_pass: false
|
||||
|
||||
fixes:
|
||||
# Map source code paths to repository root paths
|
||||
# Wildcards match any Python version (python3.*)
|
||||
- "/vllm-workspace/src/vllm/::vllm/"
|
||||
- "/vllm-workspace/vllm/::vllm/"
|
||||
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
|
||||
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
|
||||
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
|
||||
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"
|
||||
@ -8,9 +8,12 @@ namespace vllm {
|
||||
// vllm_kernel_override_batch_invariant(); returns true
|
||||
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
|
||||
inline bool vllm_kernel_override_batch_invariant() {
|
||||
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
|
||||
const char* val = std::getenv(env_key.c_str());
|
||||
return (val && std::atoi(val) != 0) ? 1 : 0;
|
||||
static bool cached = []() {
|
||||
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
|
||||
const char* val = std::getenv(env_key.c_str());
|
||||
return (val && std::atoi(val) != 0) ? 1 : 0;
|
||||
}();
|
||||
return cached;
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
from typing import Union
|
||||
|
||||
from cutlass_library import *
|
||||
|
||||
@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
|
||||
TmaWarpSpecializedCooperative = enum_auto()
|
||||
|
||||
|
||||
VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
|
||||
**DataTypeNames, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: "u4b8",
|
||||
@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
**DataTypeTag, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
|
||||
@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
||||
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
|
||||
**DataTypeSize, # type: ignore
|
||||
**{
|
||||
VLLMDataType.u4b8: 4,
|
||||
@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
||||
},
|
||||
}
|
||||
|
||||
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
VLLMDataType.u4b8: "vllm::kU4B8",
|
||||
VLLMDataType.u8b128: "vllm::kU8B128",
|
||||
DataType.u4: "vllm::kU4",
|
||||
@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
DataType.bf16: "vllm::kBfloat16",
|
||||
}
|
||||
|
||||
VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
||||
DataType.u8: "at::ScalarType::Byte",
|
||||
DataType.s8: "at::ScalarType::Char",
|
||||
DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
|
||||
@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||
DataType.f32: "at::ScalarType::Float",
|
||||
}
|
||||
|
||||
VLLMKernelScheduleTag: dict[
|
||||
Union[MixedInputKernelScheduleType, KernelScheduleType], str
|
||||
] = {
|
||||
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
|
||||
**KernelScheduleTag, # type: ignore
|
||||
**{
|
||||
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
|
||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
#include "quantization/vectorization_utils.cuh"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
||||
__shared__ float s_variance;
|
||||
float variance = 0.0f;
|
||||
|
||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
||||
|
||||
constexpr int VEC_SIZE = 8;
|
||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
||||
float x = static_cast<float>(vec.val[i]);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
auto scalar_op = [&variance](const scalar_t& val) {
|
||||
float x = static_cast<float>(val);
|
||||
variance += x * x;
|
||||
}
|
||||
};
|
||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
||||
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||
|
||||
@ -21,7 +21,6 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
#include "../cub_helpers.h"
|
||||
#include "../core/batch_invariant.hpp"
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -406,8 +405,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||
static constexpr int VPT = Constants::VPT;
|
||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
|
||||
@ -9,7 +9,6 @@ from collections.abc import Iterable
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass, fields
|
||||
from functools import reduce
|
||||
from typing import Optional, Union
|
||||
|
||||
import jinja2
|
||||
from vllm_cutlass_library_extension import (
|
||||
@ -259,7 +258,7 @@ class ScheduleConfig:
|
||||
@dataclass(frozen=True)
|
||||
class TypeConfig:
|
||||
a: DataType
|
||||
b: Union[DataType, VLLMDataType]
|
||||
b: DataType | VLLMDataType
|
||||
b_group_scale: DataType
|
||||
b_group_zeropoint: DataType
|
||||
b_channel_scale: DataType
|
||||
@ -280,7 +279,7 @@ class PrepackTypeConfig:
|
||||
class ImplConfig:
|
||||
types: TypeConfig
|
||||
schedules: list[ScheduleConfig]
|
||||
heuristic: list[tuple[Optional[str], ScheduleConfig]]
|
||||
heuristic: list[tuple[str | None, ScheduleConfig]]
|
||||
|
||||
|
||||
def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
|
||||
|
||||
@ -22,13 +22,14 @@ template <typename AllReduceKernel, typename T>
|
||||
__global__ __quickreduce_launch_bounds_two_shot__ static void
|
||||
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
||||
int rank, uint8_t** dbuffer_list,
|
||||
uint32_t data_offset, uint32_t flag_color) {
|
||||
uint32_t data_offset, uint32_t flag_color,
|
||||
int64_t data_size_per_phase) {
|
||||
int block = blockIdx.x;
|
||||
int grid = gridDim.x;
|
||||
|
||||
while (block < num_blocks) {
|
||||
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
|
||||
flag_color);
|
||||
flag_color, data_size_per_phase);
|
||||
block += grid;
|
||||
flag_color++;
|
||||
}
|
||||
@ -41,21 +42,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color); \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
} else if (world_size == 4) { \
|
||||
using LineCodec = __codec<T, 4>; \
|
||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color); \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
} else if (world_size == 8) { \
|
||||
using LineCodec = __codec<T, 8>; \
|
||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||
num_blocks, rank, dbuffer_list, data_offset, \
|
||||
flag_color); \
|
||||
flag_color, this->kMaxProblemSize); \
|
||||
}
|
||||
|
||||
enum QuickReduceQuantLevel {
|
||||
|
||||
@ -553,13 +553,12 @@ struct AllReduceTwoshot {
|
||||
int const rank, // rank index
|
||||
uint8_t** __restrict__ buffer_list, // communication buffers
|
||||
uint32_t const data_offset, // offset to start of the data buffer
|
||||
uint32_t flag_color) {
|
||||
uint32_t flag_color, int64_t data_size_per_phase) {
|
||||
// Topology
|
||||
int thread = threadIdx.x + threadIdx.y * kWavefront;
|
||||
uint8_t* rank_buffer = buffer_list[rank];
|
||||
Codec codec(thread, rank);
|
||||
int block_id = blockIdx.x;
|
||||
int grid_size = gridDim.x;
|
||||
// --------------------------------------------------------
|
||||
// Read input into registers
|
||||
int32x4_t tA[kAtoms];
|
||||
@ -588,12 +587,10 @@ struct AllReduceTwoshot {
|
||||
// rank responsible for this segment.
|
||||
uint32_t comm_data0_offset =
|
||||
data_offset + block_id * Codec::kTransmittedTileSize;
|
||||
uint32_t comm_data1_offset =
|
||||
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
|
||||
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset;
|
||||
|
||||
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
|
||||
uint32_t comm_flags1_offset =
|
||||
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
|
||||
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset;
|
||||
|
||||
for (int r = 0; r < kWorldSize; r++) {
|
||||
int32x4_t* send_buffer =
|
||||
|
||||
@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
# sync the default value with .buildkite/check-wheel-size.py
|
||||
ARG VLLM_MAX_SIZE_MB=450
|
||||
ARG VLLM_MAX_SIZE_MB=500
|
||||
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
||||
ARG RUN_WHEEL_CHECK=true
|
||||
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
|
||||
ARG BASE_UBI_IMAGE_TAG=9.6-1754584681
|
||||
|
||||
###############################################################
|
||||
# Stage to build openblas
|
||||
@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
|
||||
&& source /opt/rh/gcc-toolset-13/enable \
|
||||
&& wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
|
||||
@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
|
||||
FROM centos-deps-builder AS base-builder
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
|
||||
# Set Environment Variables for venv, cargo & openblas
|
||||
ENV VIRTUAL_ENV=/opt/vllm
|
||||
@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
|
||||
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
|
||||
libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
|
||||
harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
|
||||
&& dnf clean all \
|
||||
&& PREFIX=/usr/local make -C /openblas install \
|
||||
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
|
||||
@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
|
||||
FROM base-builder AS torch-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG TORCH_VERSION=2.6.0
|
||||
ARG TORCH_VERSION=2.7.0
|
||||
ARG _GLIBCXX_USE_CXX11_ABI=1
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
|
||||
PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
|
||||
|
||||
ARG TORCHVISION_VERSION=0.21.0
|
||||
ARG TORCHVISION_VERSION=0.22.0
|
||||
ARG TORCHVISION_USE_NVJPEG=0
|
||||
ARG TORCHVISION_USE_FFMPEG=0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
BUILD_VERSION=${TORCHVISION_VERSION} \
|
||||
uv build --wheel --out-dir /torchwheels/ --no-build-isolation
|
||||
|
||||
ARG TORCHAUDIO_VERSION=2.6.0
|
||||
ARG TORCHAUDIO_VERSION=2.7.0
|
||||
ARG BUILD_SOX=1
|
||||
ARG BUILD_KALDI=1
|
||||
ARG BUILD_RNNT=1
|
||||
@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG PYARROW_PARALLEL
|
||||
ARG PYARROW_VERSION=19.0.1
|
||||
ARG PYARROW_VERSION=21.0.0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
|
||||
@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
make install -j ${MAX_JOBS:-$(nproc)} && \
|
||||
cd ../../python/ && \
|
||||
uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
|
||||
pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
|
||||
PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
|
||||
python setup.py build_ext \
|
||||
--build-type=release --bundle-arrow-cpp \
|
||||
@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
|
||||
&& make -j ${MAX_JOBS:-$(nproc)}
|
||||
|
||||
|
||||
###############################################################
|
||||
# Stage to build numba
|
||||
###############################################################
|
||||
|
||||
FROM base-builder AS numba-builder
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG NUMBA_VERSION=0.61.2
|
||||
|
||||
# Install build tools, clone numba, patch it if needed, and build its wheel
|
||||
RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
|
||||
git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
|
||||
cd ./numba && \
|
||||
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
|
||||
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
|
||||
fi && python -m build --wheel --installer=uv --outdir /numbawheels/
|
||||
|
||||
###############################################################
|
||||
# Stage to build vllm - this stage builds and installs
|
||||
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
|
||||
@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null
|
||||
COPY --from=arrow-builder /tmp/control /dev/null
|
||||
COPY --from=cv-builder /tmp/control /dev/null
|
||||
COPY --from=numa-builder /tmp/control /dev/null
|
||||
COPY --from=numba-builder /tmp/control /dev/null
|
||||
|
||||
ARG VLLM_TARGET_DEVICE=cpu
|
||||
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
# this step installs vllm and populates uv cache
|
||||
# with all the transitive dependencies
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
dnf install llvm15 llvm15-devel -y && \
|
||||
rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
|
||||
uv pip install maturin && \
|
||||
@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
|
||||
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
|
||||
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
|
||||
--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
|
||||
--mount=type=bind,src=.,dst=/src/,rw \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
|
||||
export PATH=$PATH:/usr/lib64/llvm15/bin && \
|
||||
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
|
||||
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
|
||||
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
|
||||
sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
|
||||
uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
|
||||
make -C /numactl install && \
|
||||
# sentencepiece.pc is in some pkgconfig inside uv cache
|
||||
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
|
||||
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
|
||||
nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
|
||||
cd /src/ && \
|
||||
uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
|
||||
uv pip install /vllmwheel/*.whl
|
||||
@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
|
||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG OPENBLAS_VERSION=0.3.29
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
|
||||
# Set Environment Variables for venv & openblas
|
||||
ENV VIRTUAL_ENV=/opt/vllm
|
||||
@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
|
||||
COPY --from=numa-builder /tmp/control /dev/null
|
||||
COPY --from=lapack-builder /tmp/control /dev/null
|
||||
COPY --from=openblas-builder /tmp/control /dev/null
|
||||
COPY --from=numba-builder /tmp/control /dev/null
|
||||
|
||||
# install gcc-11, python, openblas, numactl, lapack
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
|
||||
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
|
||||
microdnf install --nodocs -y \
|
||||
tar findutils openssl \
|
||||
libomp tar findutils openssl llvm15 llvm15-devel \
|
||||
pkgconfig xsimd g++ gcc-fortran libsndfile \
|
||||
libtiff libjpeg openjpeg2 zlib zeromq \
|
||||
freetype lcms2 libwebp tcl tk utf8proc \
|
||||
harfbuzz fribidi libraqm libimagequant libxcb \
|
||||
harfbuzz fribidi libraqm libimagequant libxcb util-linux \
|
||||
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
|
||||
&& microdnf clean all \
|
||||
&& export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
|
||||
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
|
||||
&& python -m pip install -U pip uv --no-cache \
|
||||
&& make -C /numactl install \
|
||||
@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
|
||||
--mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
|
||||
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
|
||||
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
|
||||
--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
|
||||
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
|
||||
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
|
||||
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
WORKDIR /workspace/vllm
|
||||
@ -314,4 +340,4 @@ WORKDIR /workspace/
|
||||
|
||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# install nixl from source code
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
|
||||
|
||||
ENTRYPOINT ["vllm", "serve"]
|
||||
|
||||
@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
|
||||
tensor_parallel_size=2)
|
||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2)
|
||||
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
|
||||
```
|
||||
|
||||
## Reduce CUDA Graphs
|
||||
@ -61,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from vllm.config import CompilationConfig, CompilationLevel
|
||||
from vllm.config import CompilationConfig, CompilationMode
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
# By default, it goes up to max_num_seqs
|
||||
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
||||
),
|
||||
@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
enforce_eager=True)
|
||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
|
||||
```
|
||||
|
||||
## Adjust cache size
|
||||
@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
|
||||
from vllm import LLM
|
||||
|
||||
# Accept up to 3 images and 1 video per prompt
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"image": 3, "video": 1})
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"image": 3, "video": 1},
|
||||
)
|
||||
```
|
||||
|
||||
You can go a step further and disable unused modalities completely by setting their limits to zero.
|
||||
@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
|
||||
from vllm import LLM
|
||||
|
||||
# Accept any number of images but no videos
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"video": 0})
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
limit_mm_per_prompt={"video": 0},
|
||||
)
|
||||
```
|
||||
|
||||
You can even run a multi-modal model for text-only inference:
|
||||
@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
|
||||
from vllm import LLM
|
||||
|
||||
# Don't accept images. Just text.
|
||||
llm = LLM(model="google/gemma-3-27b-it",
|
||||
limit_mm_per_prompt={"image": 0})
|
||||
llm = LLM(
|
||||
model="google/gemma-3-27b-it",
|
||||
limit_mm_per_prompt={"image": 0},
|
||||
)
|
||||
```
|
||||
|
||||
### Configurable options
|
||||
@ -173,14 +175,14 @@ Here are some examples:
|
||||
from vllm import LLM
|
||||
|
||||
# Available for Qwen2-VL series models
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_kwargs={
|
||||
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
|
||||
})
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28
|
||||
)
|
||||
|
||||
# Available for InternVL series models
|
||||
llm = LLM(model="OpenGVLab/InternVL2-2B",
|
||||
mm_processor_kwargs={
|
||||
"max_dynamic_patch": 4, # Default is 12
|
||||
})
|
||||
llm = LLM(
|
||||
model="OpenGVLab/InternVL2-2B",
|
||||
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12
|
||||
)
|
||||
```
|
||||
|
||||
@ -100,7 +100,7 @@ from vllm import LLM
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.3-70B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
pipeline_parallel_size=2
|
||||
pipeline_parallel_size=2,
|
||||
)
|
||||
```
|
||||
|
||||
@ -257,18 +257,24 @@ Examples:
|
||||
|
||||
```python
|
||||
# Use a larger cache
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=8)
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=8,
|
||||
)
|
||||
|
||||
# Use a shared-memory based IPC cache
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
mm_processor_cache_type="shm",
|
||||
mm_processor_cache_gb=8)
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
tensor_parallel_size=2,
|
||||
mm_processor_cache_type="shm",
|
||||
mm_processor_cache_gb=8,
|
||||
)
|
||||
|
||||
# Disable the cache
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=0)
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
mm_processor_cache_gb=0,
|
||||
)
|
||||
```
|
||||
|
||||
### Cache Placement
|
||||
|
||||
@ -35,6 +35,7 @@ th {
|
||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||
| Random | ✅ | ✅ | `synthetic` |
|
||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
||||
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
|
||||
@ -878,6 +879,51 @@ vllm bench serve \
|
||||
|
||||
</details>
|
||||
|
||||
#### Reranker Benchmark
|
||||
|
||||
Benchmark the performance of rerank requests in vLLM.
|
||||
|
||||
<details class="admonition abstract" markdown="1">
|
||||
<summary>Show more</summary>
|
||||
|
||||
Unlike generative models which use Completions API or Chat Completions API,
|
||||
you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
|
||||
|
||||
For reranking, the only supported dataset is `--dataset-name random-rerank`.
|
||||
|
||||
Start the server:
|
||||
|
||||
```bash
|
||||
vllm serve BAAI/bge-reranker-v2-m3
|
||||
```
|
||||
|
||||
Run the benchmark:
|
||||
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--model BAAI/bge-reranker-v2-m3 \
|
||||
--backend vllm-rerank \
|
||||
--endpoint /v1/rerank \
|
||||
--dataset-name random-rerank \
|
||||
--tokenizer BAAI/bge-reranker-v2-m3 \
|
||||
--random-input-len 512 \
|
||||
--num-prompts 10 \
|
||||
--random-batch-size 5
|
||||
```
|
||||
|
||||
For reranker models, this will create `num_prompts / random_batch_size` requests with
|
||||
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
|
||||
In the example above, this results in 2 rerank requests with 5 "documents" each where
|
||||
each document has close to 512 tokens.
|
||||
|
||||
Please note that the `/v1/rerank` endpoint is also supported by embedding models. So if you're running
|
||||
with an embedding model, also set `--no_reranker`. Because in this case the query is
|
||||
treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
|
||||
to account for the extra prompt which is the query. The token accounting to report the
|
||||
throughput numbers correctly is also adjusted.
|
||||
|
||||
</details>
|
||||
|
||||
[](){ #performance-benchmarks }
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
@ -73,8 +73,8 @@ def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
intermediate_tensors: IntermediateTensors | None = None,
|
||||
inputs_embeds: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
```
|
||||
|
||||
@ -16,7 +16,7 @@ Further update the model as follows:
|
||||
...
|
||||
|
||||
@classmethod
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
|
||||
if modality.startswith("image"):
|
||||
return "<image>"
|
||||
|
||||
@ -45,14 +45,14 @@ Further update the model as follows:
|
||||
...
|
||||
|
||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||
|
||||
assert self.vision_encoder is not None
|
||||
image_features = self.vision_encoder(image_input)
|
||||
return self.multi_modal_projector(image_features)
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
|
||||
self,
|
||||
**kwargs: object,
|
||||
) -> MultiModalEmbeddings | None:
|
||||
# Validate the multimodal input keyword arguments
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
|
||||
For example, if the model supports any number of images but only one video per prompt:
|
||||
|
||||
```python
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
return {"image": None, "video": 1}
|
||||
```
|
||||
|
||||
@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
|
||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
```python
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
image_processor = self.get_image_processor()
|
||||
return ImageSize(width=image_processor.size["width"],
|
||||
height=image_processor.size["height"])
|
||||
return ImageSize(
|
||||
width=image_processor.size["width"],
|
||||
height=image_processor.size["height"],
|
||||
)
|
||||
```
|
||||
|
||||
Fuyu does not expect image placeholders in the inputs to HF processor, so
|
||||
@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images,
|
||||
overrides=image_overrides)
|
||||
self._get_dummy_images(
|
||||
width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images,
|
||||
overrides=image_overrides,
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
@ -810,9 +812,11 @@ to register them to the multi-modal registry:
|
||||
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||
+ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
|
||||
+ info=YourProcessingInfo,
|
||||
+ dummy_inputs=YourDummyInputsBuilder)
|
||||
+ @MULTIMODAL_REGISTRY.register_processor(
|
||||
+ YourMultiModalProcessor,
|
||||
+ info=YourProcessingInfo,
|
||||
+ dummy_inputs=YourDummyInputsBuilder,
|
||||
+ )
|
||||
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
|
||||
```
|
||||
|
||||
|
||||
@ -42,7 +42,7 @@ def register():
|
||||
|
||||
ModelRegistry.register_model(
|
||||
"YourModelForCausalLM",
|
||||
"your_code:YourModelForCausalLM"
|
||||
"your_code:YourModelForCausalLM",
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@ -15,8 +15,9 @@ Declare supported languages and capabilities:
|
||||
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g., Whisper).
|
||||
|
||||
??? code "supported_languages and supports_transcription_only"
|
||||
|
||||
```python
|
||||
from typing import ClassVar, Mapping, Optional, Literal
|
||||
from typing import ClassVar, Mapping, Literal
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
|
||||
This is for controlling general behavior of the API when serving your model:
|
||||
|
||||
??? code "get_speech_to_text_config()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
|
||||
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
|
||||
|
||||
??? code "get_generation_prompt()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -81,10 +84,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
|
||||
audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig,
|
||||
model_config: ModelConfig,
|
||||
language: Optional[str],
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
request_prompt: str,
|
||||
to_language: Optional[str],
|
||||
to_language: str | None,
|
||||
) -> PromptType:
|
||||
# Example with a free-form instruction prompt
|
||||
task_word = "Transcribe" if task_type == "transcribe" else "Translate"
|
||||
@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
|
||||
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
||||
|
||||
??? code "get_generation_prompt()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -117,10 +121,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
||||
audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig,
|
||||
model_config: ModelConfig,
|
||||
language: Optional[str],
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
request_prompt: str,
|
||||
to_language: Optional[str],
|
||||
to_language: str | None,
|
||||
) -> PromptType:
|
||||
if language is None:
|
||||
raise ValueError("Language must be specified")
|
||||
@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
|
||||
If your model requires a language and you want a default, override this method (see Whisper):
|
||||
|
||||
??? code "validate_language()"
|
||||
|
||||
```python
|
||||
@classmethod
|
||||
def validate_language(cls, language: Optional[str]) -> Optional[str]:
|
||||
def validate_language(cls, language: str | None) -> str | None:
|
||||
if language is None:
|
||||
logger.warning(
|
||||
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
|
||||
"Defaulting to language='en'. If you wish to transcribe "
|
||||
"audio in a different language, pass the `language` field "
|
||||
"in the TranscriptionRequest."
|
||||
)
|
||||
language = "en"
|
||||
return super().validate_language(language)
|
||||
```
|
||||
@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
|
||||
Provide a fast duration→token estimate to improve streaming usage statistics:
|
||||
|
||||
??? code "get_num_audio_tokens()"
|
||||
|
||||
```python
|
||||
class YourASRModel(nn.Module, SupportsTranscription):
|
||||
...
|
||||
@ -175,7 +184,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
|
||||
audio_duration_s: float,
|
||||
stt_config: SpeechToTextConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> Optional[int]:
|
||||
) -> int | None:
|
||||
# Return None if unknown; otherwise return an estimate.
|
||||
return int(audio_duration_s * stt_config.sample_rate // 320) # example
|
||||
```
|
||||
@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
|
||||
Relevant server logic:
|
||||
|
||||
??? code "_preprocess_speech_to_text()"
|
||||
|
||||
```python
|
||||
# vllm/entrypoints/openai/speech_to_text.py
|
||||
async def _preprocess_speech_to_text(...):
|
||||
|
||||
@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
|
||||
|
||||
??? console "Command"
|
||||
|
||||
```python
|
||||
```bash
|
||||
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'Authorization: <JWT TOKEN>' \
|
||||
@ -81,7 +81,7 @@ You should get a response like:
|
||||
|
||||
??? console "Response"
|
||||
|
||||
```python
|
||||
```json
|
||||
{
|
||||
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
|
||||
"result": {
|
||||
|
||||
@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
|
||||
|
||||
client = OpenAI(
|
||||
base_url="https://gateway.<gateway domain>",
|
||||
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
|
||||
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
|
||||
"role": "user",
|
||||
"content": "Compose a poem that explains the concept of recursion in programming.",
|
||||
}
|
||||
]
|
||||
],
|
||||
)
|
||||
|
||||
print(completion.choices[0].message.content)
|
||||
|
||||
@ -34,7 +34,7 @@ pip install vllm haystack-ai
|
||||
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
|
||||
model="mistralai/Mistral-7B-Instruct-v0.1",
|
||||
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
|
||||
generation_kwargs = {"max_tokens": 512}
|
||||
generation_kwargs={"max_tokens": 512},
|
||||
)
|
||||
|
||||
response = generator.run(
|
||||
|
||||
@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
|
||||
import os
|
||||
|
||||
client = OpenAI(
|
||||
base_url = DEPLOYMENT_URL,
|
||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||
base_url=DEPLOYMENT_URL,
|
||||
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
|
||||
)
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model = "HuggingFaceTB/SmolLM3-3B",
|
||||
messages = [
|
||||
model="HuggingFaceTB/SmolLM3-3B",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Give me a brief explanation of gravity in simple terms."
|
||||
"text": "Give me a brief explanation of gravity in simple terms.",
|
||||
}
|
||||
]
|
||||
],
|
||||
}
|
||||
],
|
||||
stream = True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for message in chat_completion:
|
||||
print(message.choices[0].delta.content, end = "")
|
||||
print(message.choices[0].delta.content, end="")
|
||||
```
|
||||
|
||||
!!! note
|
||||
@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
|
||||
import os
|
||||
|
||||
client = OpenAI(
|
||||
base_url = DEPLOYMENT_URL,
|
||||
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||
base_url=DEPLOYMENT_URL,
|
||||
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
|
||||
)
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
model = "ibm-granite/granite-docling-258M",
|
||||
messages = [
|
||||
model="ibm-granite/granite-docling-258M",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
|
||||
}
|
||||
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Convert this page to docling."
|
||||
}
|
||||
"text": "Convert this page to docling.",
|
||||
},
|
||||
]
|
||||
}
|
||||
],
|
||||
stream = True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for message in chat_completion:
|
||||
print(message.choices[0].delta.content, end = "")
|
||||
print(message.choices[0].delta.content, end="")
|
||||
```
|
||||
|
||||
!!! note
|
||||
|
||||
@ -36,15 +36,16 @@ pip install vllm litellm
|
||||
```python
|
||||
import litellm
|
||||
|
||||
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||
messages = [{"content": "Hello, how are you?", "role": "user"}]
|
||||
|
||||
# hosted_vllm is prefix key word and necessary
|
||||
response = litellm.completion(
|
||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||
temperature=0.2,
|
||||
max_tokens=80)
|
||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||
messages=messages,
|
||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||
temperature=0.2,
|
||||
max_tokens=80,
|
||||
)
|
||||
|
||||
print(response)
|
||||
```
|
||||
|
||||
@ -40,7 +40,7 @@ pip install -U vllm \
|
||||
|
||||
1. Run the script
|
||||
|
||||
```python
|
||||
```bash
|
||||
python retrieval_augmented_generation_with_langchain.py
|
||||
```
|
||||
|
||||
@ -78,6 +78,6 @@ pip install vllm \
|
||||
|
||||
1. Run the script:
|
||||
|
||||
```python
|
||||
```bash
|
||||
python retrieval_augmented_generation_with_llamaindex.py
|
||||
```
|
||||
|
||||
@ -106,9 +106,11 @@ The dispatch code looks like:
|
||||
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
|
||||
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
|
||||
# execution
|
||||
with set_forward_context(...,
|
||||
cudagraph_runtime_mode=runtime_mode,
|
||||
batch_descriptor=batch_descriptor):
|
||||
with set_forward_context(
|
||||
...,
|
||||
cudagraph_runtime_mode=runtime_mode,
|
||||
batch_descriptor=batch_descriptor,
|
||||
):
|
||||
output = self.model(...)
|
||||
```
|
||||
|
||||
@ -165,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
|
||||
"""NO CUDA Graphs support"""
|
||||
```
|
||||
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||
|
||||
The following table lists backends that support full CUDA Graphs at the time of writing.
|
||||
|
||||
@ -200,12 +202,12 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
|
||||
import vllm
|
||||
from vllm.config import CUDAGraphMode
|
||||
|
||||
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||
compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||
model = vllm.LLM(
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
dtype='auto',
|
||||
compilation_config = compilation_config,
|
||||
)
|
||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||
dtype="auto",
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
sampling_params = vllm.SamplingParams(
|
||||
temperature=0, # greedy decoding
|
||||
max_tokens=1024,
|
||||
|
||||
@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve
|
||||
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
|
||||
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
|
||||
|
||||
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
|
||||
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
|
||||
|
||||
Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
|
||||
EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo`
|
||||
EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency`
|
||||
|
||||
Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`.
|
||||
|
||||
|
||||
@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
|
||||
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
|
||||
|
||||
```python
|
||||
IOProcessorInput = TypeVar('IOProcessorInput')
|
||||
IOProcessorOutput = TypeVar('IOProcessorOutput')
|
||||
IOProcessorInput = TypeVar("IOProcessorInput")
|
||||
IOProcessorOutput = TypeVar("IOProcessorOutput")
|
||||
|
||||
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
|
||||
@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
def pre_process(
|
||||
self,
|
||||
prompt: IOProcessorInput,
|
||||
request_id: Optional[str] = None,
|
||||
request_id: str | None = None,
|
||||
**kwargs,
|
||||
) -> Union[PromptType, Sequence[PromptType]]:
|
||||
) -> PromptType | Sequence[PromptType]:
|
||||
raise NotImplementedError
|
||||
|
||||
async def pre_process_async(
|
||||
self,
|
||||
prompt: IOProcessorInput,
|
||||
request_id: Optional[str] = None,
|
||||
request_id: str | None = None,
|
||||
**kwargs,
|
||||
) -> Union[PromptType, Sequence[PromptType]]:
|
||||
) -> PromptType | Sequence[PromptType]:
|
||||
return self.pre_process(prompt, request_id, **kwargs)
|
||||
|
||||
@abstractmethod
|
||||
def post_process(self,
|
||||
model_output: Sequence[PoolingRequestOutput],
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs) -> IOProcessorOutput:
|
||||
def post_process(
|
||||
self,
|
||||
model_output: Sequence[PoolingRequestOutput],
|
||||
request_id: str | None = None,
|
||||
**kwargs,
|
||||
) -> IOProcessorOutput:
|
||||
raise NotImplementedError
|
||||
|
||||
async def post_process_async(
|
||||
self,
|
||||
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
|
||||
request_id: Optional[str] = None,
|
||||
request_id: str | None = None,
|
||||
**kwargs,
|
||||
) -> IOProcessorOutput:
|
||||
collected_output = [item async for i, item in model_output]
|
||||
@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
|
||||
@abstractmethod
|
||||
def output_to_response(
|
||||
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
|
||||
self, plugin_output: IOProcessorOutput
|
||||
) -> IOProcessorResponse:
|
||||
raise NotImplementedError
|
||||
```
|
||||
|
||||
|
||||
@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum, auto
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
|
||||
@abstractmethod
|
||||
def update_state(
|
||||
self,
|
||||
batch_update: Optional["BatchUpdate"],
|
||||
batch_update: "BatchUpdate | None",
|
||||
) -> None:
|
||||
"""Called when there are new output tokens, prior
|
||||
to each forward pass.
|
||||
@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
|
||||
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
|
||||
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
|
||||
|
||||
* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
|
||||
* `update_state(self, batch_update: "BatchUpdate | None") -> None`:
|
||||
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
|
||||
* Use the `BatchUpdate` members to update logits processor internal state
|
||||
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
|
||||
|
||||
@ -478,15 +478,17 @@ us with:
|
||||
|
||||
```python
|
||||
if seq_group.is_finished():
|
||||
if (seq_group.metrics.first_scheduled_time is not None and
|
||||
seq_group.metrics.first_token_time is not None):
|
||||
if (
|
||||
seq_group.metrics.first_scheduled_time is not None
|
||||
and seq_group.metrics.first_token_time is not None
|
||||
):
|
||||
time_queue_requests.append(
|
||||
seq_group.metrics.first_scheduled_time -
|
||||
seq_group.metrics.arrival_time)
|
||||
seq_group.metrics.arrival_time
|
||||
)
|
||||
...
|
||||
if seq_group.metrics.time_in_queue is not None:
|
||||
time_in_queue_requests.append(
|
||||
seq_group.metrics.time_in_queue)
|
||||
time_in_queue_requests.append(seq_group.metrics.time_in_queue)
|
||||
```
|
||||
|
||||
This seems duplicative, and one of them should be removed. The latter
|
||||
|
||||
@ -112,8 +112,8 @@ class KVCacheBlock:
|
||||
ref_cnt: int
|
||||
|
||||
# The pointers to form a doubly linked list for the free queue.
|
||||
prev_free_block: Optional["KVCacheBlock"] = None
|
||||
next_free_block: Optional["KVCacheBlock"] = None
|
||||
prev_free_block: "KVCacheBlock | None" = None
|
||||
next_free_block: "KVCacheBlock | None" = None
|
||||
```
|
||||
|
||||
There are two design points to highlight:
|
||||
|
||||
@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes
|
||||
??? code "Example custom logits processor definition"
|
||||
|
||||
``` python
|
||||
from typing import Optional
|
||||
import torch
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.sampling_params import SamplingParams
|
||||
@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes
|
||||
"""Never impacts greedy sampling"""
|
||||
return False
|
||||
|
||||
def update_state(self, batch_update: Optional[BatchUpdate]):
|
||||
def update_state(self, batch_update: BatchUpdate | None):
|
||||
if not batch_update:
|
||||
return
|
||||
|
||||
|
||||
@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=256,
|
||||
stop=["[/assistant]"]
|
||||
stop=["[/assistant]"],
|
||||
)
|
||||
|
||||
prompts = [
|
||||
@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
|
||||
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
|
||||
)
|
||||
```
|
||||
|
||||
@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
|
||||
lora_request = LoRARequest(
|
||||
lora_name=lora_name,
|
||||
lora_path=local_path,
|
||||
lora_int_id=abs(hash(lora_name))
|
||||
lora_int_id=abs(hash(lora_name)),
|
||||
)
|
||||
return lora_request
|
||||
```
|
||||
@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
|
||||
if has_audio:
|
||||
question = f"<|audio|>{question}"
|
||||
chat = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": question
|
||||
}
|
||||
{"role": "user", "content": question},
|
||||
]
|
||||
return tokenizer.apply_chat_template(chat, tokenize=False)
|
||||
|
||||
|
||||
@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"image": [image1, image2]
|
||||
},
|
||||
"multi_modal_data": {"image": [image1, image2]},
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
@ -183,21 +181,24 @@ conversation = [
|
||||
{"role": "assistant", "content": "Hello! How can I assist you today?"},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
},{
|
||||
"type": "image_pil",
|
||||
"image_pil": image_pil
|
||||
}, {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": image_embeds
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
}],
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
{
|
||||
"type": "image_pil",
|
||||
"image_pil": image_pil,
|
||||
},
|
||||
{
|
||||
"type": "image_embeds",
|
||||
"image_embeds": image_embeds,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in these images?",
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this set of frames. Consider the frames to be a part of the same video.",
|
||||
},
|
||||
],
|
||||
}
|
||||
for i in range(len(video_frames)):
|
||||
@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
|
||||
# Custom black background for dark theme
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
|
||||
)
|
||||
|
||||
# Custom brand color background (e.g., blue)
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
|
||||
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
|
||||
)
|
||||
```
|
||||
|
||||
@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=1024,
|
||||
)
|
||||
sampling_params = SamplingParams(max_tokens=1024)
|
||||
|
||||
video_messages = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "describe this video."},
|
||||
{
|
||||
"type": "video",
|
||||
"video": video_path,
|
||||
"total_pixels": 20480 * 28 * 28,
|
||||
"min_pixels": 16 * 28 * 28
|
||||
}
|
||||
"min_pixels": 16 * 28 * 28,
|
||||
},
|
||||
]
|
||||
},
|
||||
]
|
||||
@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows:
|
||||
|
||||
chat_response = client.chat.completions.create(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
||||
# since the prompt will be processed automatically by the API server.
|
||||
{"type": "text", "text": "What’s in this image?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
url": image_url
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
# NOTE: The prompt formatting with the image token `<image>` is not needed
|
||||
# since the prompt will be processed automatically by the API server.
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What’s in this image?",
|
||||
},
|
||||
"uuid": image_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
"uuid": image_url, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||
|
||||
@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows:
|
||||
|
||||
chat_response = client.chat.completions.create(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What are the animals in these images?"},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url_duck
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What are the animals in these images?",
|
||||
},
|
||||
"uuid": image_url_duck # Optional
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url_lion
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url_duck},
|
||||
"uuid": image_url_duck, # Optional
|
||||
},
|
||||
"uuid": image_url_lion # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url_lion},
|
||||
"uuid": image_url_lion, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
)
|
||||
print("Chat completion output:", chat_response.choices[0].message.content)
|
||||
```
|
||||
@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows:
|
||||
|
||||
## Use video url in the payload
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
messages=[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this video?"
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": video_url
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this video?",
|
||||
},
|
||||
"uuid": video_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
"uuid": video_url, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows:
|
||||
audio_base64 = encode_base64_content_from_url(audio_url)
|
||||
|
||||
chat_completion_from_base64 = client.chat.completions.create(
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?"
|
||||
},
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {
|
||||
"data": audio_base64,
|
||||
"format": "wav"
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?",
|
||||
},
|
||||
"uuid": audio_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {
|
||||
"data": audio_base64,
|
||||
"format": "wav",
|
||||
},
|
||||
"uuid": audio_url, # Optional
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
|
||||
|
||||
```python
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?"
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this audio?",
|
||||
},
|
||||
"uuid": audio_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {"url": audio_url},
|
||||
"uuid": audio_url, # Optional
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
@ -747,43 +759,48 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
|
||||
# Basic usage - this is equivalent to the LLaVA example for offline inference
|
||||
model = "llava-hf/llava-1.5-7b-hf"
|
||||
embeds = {
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": f"{base64_image_embedding}",
|
||||
"uuid": image_url # Optional
|
||||
"uuid": image_url, # Optional
|
||||
}
|
||||
|
||||
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
|
||||
model = "Qwen/Qwen2-VL-2B-Instruct"
|
||||
embeds = {
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
"image_embeds": f"{base64_image_embedding}", # Required
|
||||
"image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
},
|
||||
"uuid": image_url # Optional
|
||||
"uuid": image_url, # Optional
|
||||
}
|
||||
model = "openbmb/MiniCPM-V-2_6"
|
||||
embeds = {
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": {
|
||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
|
||||
"image_embeds": f"{base64_image_embedding}", # Required
|
||||
"image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6
|
||||
},
|
||||
"uuid": image_url # Optional
|
||||
"uuid": image_url, # Optional
|
||||
}
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this image?",
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant.",
|
||||
},
|
||||
embeds,
|
||||
],
|
||||
},
|
||||
],
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this image?",
|
||||
},
|
||||
embeds,
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model,
|
||||
)
|
||||
```
|
||||
@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
|
||||
{
|
||||
"type": "image_embeds",
|
||||
"image_embeds": None,
|
||||
"uuid": image_uuid
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
|
||||
# input_audio:
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": None,
|
||||
"uuid": audio_uuid
|
||||
"uuid": audio_uuid,
|
||||
},
|
||||
|
||||
# PIL Image:
|
||||
{
|
||||
"type": "image_pil",
|
||||
"image_pil": None
|
||||
"uuid": image_uuid
|
||||
}
|
||||
"image_pil": None,
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
|
||||
```
|
||||
|
||||
|
||||
@ -156,6 +156,16 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
|
||||
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
|
||||
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
|
||||
|
||||
## Experimental Feature
|
||||
|
||||
### Heterogeneous KV Layout support
|
||||
|
||||
Supported use case: prefill with the 'HND' KV layout and decode with the 'NHD' KV layout, enabled with the following experimental configuration:
|
||||
|
||||
```bash
|
||||
--kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
|
||||
```
|
||||
|
||||
## Example Scripts/Code
|
||||
|
||||
Refer to these example scripts in the vLLM repository:
|
||||
|
||||
@ -1,5 +1,9 @@
|
||||
# AutoAWQ
|
||||
|
||||
> ⚠️ **Warning:**
|
||||
The `AutoAWQ` library is deprecated. This functionality has been adopted by the vLLM project in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq).
|
||||
For the recommended quantization workflow, please see the AWQ examples in [`llm-compressor`](https://github.com/vllm-project/llm-compressor/tree/main/examples/awq). For more details on the deprecation, refer to the original [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ).
|
||||
|
||||
To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
|
||||
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
|
||||
The main benefits are lower latency and memory usage.
|
||||
@ -18,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
|
||||
from awq import AutoAWQForCausalLM
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
|
||||
quant_path = 'mistral-instruct-v0.2-awq'
|
||||
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
|
||||
model_path = "mistralai/Mistral-7B-Instruct-v0.2"
|
||||
quant_path = "mistral-instruct-v0.2-awq"
|
||||
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
|
||||
|
||||
# Load model
|
||||
model = AutoAWQForCausalLM.from_pretrained(
|
||||
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
|
||||
model_path,
|
||||
low_cpu_mem_usage=True,
|
||||
use_cache=False,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from auto_round import AutoRound
|
||||
|
||||
model_name = "Qwen/Qwen3-0.6B"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
bits, group_size, sym = 4, 128, True
|
||||
|
||||
@ -34,7 +34,7 @@ llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas"
|
||||
quantization="bitblas",
|
||||
)
|
||||
```
|
||||
|
||||
@ -53,6 +53,6 @@ llm = LLM(
|
||||
dtype=torch.float16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitblas",
|
||||
max_model_len=1024
|
||||
max_model_len=1024,
|
||||
)
|
||||
```
|
||||
|
||||
@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
|
||||
llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True
|
||||
trust_remote_code=True,
|
||||
)
|
||||
```
|
||||
|
||||
@ -43,7 +43,7 @@ llm = LLM(
|
||||
model=model_id,
|
||||
dtype=torch.bfloat16,
|
||||
trust_remote_code=True,
|
||||
quantization="bitsandbytes"
|
||||
quantization="bitsandbytes",
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
|
||||
|
||||
# Configure the simple PTQ quantization
|
||||
recipe = QuantizationModifier(
|
||||
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
|
||||
targets="Linear",
|
||||
scheme="FP8_DYNAMIC",
|
||||
ignore=["lm_head"],
|
||||
)
|
||||
|
||||
# Apply the quantization algorithm.
|
||||
oneshot(model=model, recipe=recipe)
|
||||
|
||||
@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
conversation = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant"
|
||||
"content": "You are a helpful assistant",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello"
|
||||
"content": "Hello",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I assist you today?"
|
||||
"content": "Hello! How can I assist you today?",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
|
||||
llm = LLM(
|
||||
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
|
||||
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.chat(conversation, sampling_params)
|
||||
|
||||
@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
|
||||
calibration_dataset = load_dataset(
|
||||
"allenai/c4",
|
||||
data_files="en/c4-train.00001-of-01024.json.gz",
|
||||
split="train"
|
||||
split="train",
|
||||
).select(range(1024))["text"]
|
||||
|
||||
quant_config = QuantizeConfig(bits=4, group_size=128)
|
||||
|
||||
@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
|
||||
},
|
||||
ignore=["lm_head"],
|
||||
update_size=NUM_CALIBRATION_SAMPLES,
|
||||
dampening_frac=0.01
|
||||
dampening_frac=0.01,
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
```
|
||||
|
||||
@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
def main():
|
||||
|
||||
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
|
||||
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
|
||||
|
||||
# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
|
||||
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
|
||||
|
||||
@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
|
||||
kv_cache_dtype="fp8",
|
||||
calculate_kv_scales=True)
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-2-7b-chat-hf",
|
||||
kv_cache_dtype="fp8",
|
||||
calculate_kv_scales=True,
|
||||
)
|
||||
prompt = "London is the capital of"
|
||||
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
|
||||
print(out)
|
||||
@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
|
||||
|
||||
# Select model and load it
|
||||
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||
|
||||
# Select calibration dataset
|
||||
|
||||
@ -48,7 +48,9 @@ to fetch model and tokenizer.
|
||||
MAX_SEQ_LEN = 512
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
MODEL_ID,
|
||||
device_map="auto",
|
||||
dtype="auto",
|
||||
)
|
||||
model.eval()
|
||||
|
||||
@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
|
||||
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
|
||||
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
|
||||
|
||||
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
|
||||
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
|
||||
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
|
||||
batch_size=BATCH_SIZE, drop_last=True)
|
||||
tokenized_outputs = tokenizer(
|
||||
text_data,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=MAX_SEQ_LEN,
|
||||
)
|
||||
calib_dataloader = DataLoader(
|
||||
tokenized_outputs['input_ids'],
|
||||
batch_size=BATCH_SIZE,
|
||||
drop_last=True,
|
||||
)
|
||||
```
|
||||
|
||||
### 3. Set the Quantization Configuration
|
||||
@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
|
||||
load_quant_algo_config_from_file)
|
||||
|
||||
# Define fp8/per-tensor/static spec.
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
|
||||
is_dynamic=False).to_quantization_spec()
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
|
||||
observer_method="min_max",
|
||||
is_dynamic=False,
|
||||
).to_quantization_spec()
|
||||
|
||||
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
|
||||
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC)
|
||||
global_quant_config = QuantizationConfig(
|
||||
input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC,
|
||||
)
|
||||
|
||||
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
|
||||
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
|
||||
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
|
||||
kv_cache_quant_config = {name :
|
||||
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC)
|
||||
for name in kv_cache_layer_names_for_llama}
|
||||
kv_cache_quant_config = {
|
||||
name: QuantizationConfig(
|
||||
input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC,
|
||||
)
|
||||
for name in kv_cache_layer_names_for_llama
|
||||
}
|
||||
layer_quant_config = kv_cache_quant_config.copy()
|
||||
|
||||
# Define algorithm config by config file.
|
||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
|
||||
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
|
||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
|
||||
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
|
||||
|
||||
EXCLUDE_LAYERS = ["lm_head"]
|
||||
@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
|
||||
layer_quant_config=layer_quant_config,
|
||||
kv_cache_quant_config=kv_cache_quant_config,
|
||||
exclude=EXCLUDE_LAYERS,
|
||||
algo_config=algo_config)
|
||||
algo_config=algo_config,
|
||||
)
|
||||
```
|
||||
|
||||
### 4. Quantize the Model and Export
|
||||
@ -165,8 +182,11 @@ for more exporting format details.
|
||||
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
||||
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
||||
with torch.no_grad():
|
||||
exporter.export_safetensors_model(freezed_model,
|
||||
quant_config=quant_config, tokenizer=tokenizer)
|
||||
exporter.export_safetensors_model(
|
||||
freezed_model,
|
||||
quant_config=quant_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
```
|
||||
|
||||
### 5. Evaluation in vLLM
|
||||
@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||
kv_cache_dtype='fp8',quantization='quark')
|
||||
llm = LLM(
|
||||
model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||
kv_cache_dtype="fp8",
|
||||
quantization="quark",
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
|
||||
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype="auto",
|
||||
dtype="auto",
|
||||
device_map="auto",
|
||||
quantization_config=quantization_config
|
||||
)
|
||||
|
||||
@ -11,6 +11,9 @@ vLLM currently supports the following reasoning models:
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
|
||||
| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
|
||||
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
|
||||
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
|
||||
@ -18,8 +21,9 @@ vLLM currently supports the following reasoning models:
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
|
||||
|
||||
!!! note
|
||||
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
|
||||
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
|
||||
|
||||
## Quickstart
|
||||
|
||||
@ -115,9 +119,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
|
||||
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
|
||||
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
|
||||
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
|
||||
stream = client.chat.completions.create(model=model,
|
||||
messages=messages,
|
||||
stream=True)
|
||||
stream = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
print("client: Start streaming chat completions...")
|
||||
printed_reasoning_content = False
|
||||
@ -157,27 +163,29 @@ The reasoning content is also available when both tool calling and the reasoning
|
||||
|
||||
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
|
||||
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
|
||||
},
|
||||
"required": ["location", "unit"]
|
||||
}
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
|
||||
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
|
||||
},
|
||||
"required": ["location", "unit"],
|
||||
}
|
||||
},
|
||||
}
|
||||
}]
|
||||
]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=client.models.list().data[0].id,
|
||||
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
|
||||
tools=tools,
|
||||
tool_choice="auto"
|
||||
tool_choice="auto",
|
||||
)
|
||||
|
||||
print(response)
|
||||
@ -223,7 +231,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
|
||||
previous_token_ids: Sequence[int],
|
||||
current_token_ids: Sequence[int],
|
||||
delta_token_ids: Sequence[int],
|
||||
) -> Union[DeltaMessage, None]:
|
||||
) -> DeltaMessage | None:
|
||||
"""
|
||||
Instance method that should be implemented for extracting reasoning
|
||||
from an incomplete response; for use when handling reasoning calls and
|
||||
@ -233,8 +241,10 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
|
||||
"""
|
||||
|
||||
def extract_reasoning_content(
|
||||
self, model_output: str, request: ChatCompletionRequest
|
||||
) -> tuple[Optional[str], Optional[str]]:
|
||||
self,
|
||||
model_output: str,
|
||||
request: ChatCompletionRequest | ResponsesRequest,
|
||||
) -> tuple[str | None, str | None]:
|
||||
"""
|
||||
Extract reasoning content from a complete model-generated string.
|
||||
|
||||
@ -272,10 +282,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
|
||||
|
||||
@classmethod
|
||||
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
|
||||
return cls(start_token_id=tokenizer.encode(
|
||||
"<think>", add_special_tokens=False)[0],
|
||||
end_token_id=tokenizer.encode("</think>",
|
||||
add_special_tokens=False)[0])
|
||||
return cls(
|
||||
start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
|
||||
end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
|
||||
)
|
||||
|
||||
def is_reasoning_end(self, input_ids: list[int]) -> bool:
|
||||
return self.end_token_id in input_ids
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user