Compare commits
4 Commits
zhuohan/re
...
woosuk/rem
| Author | SHA1 | Date | |
|---|---|---|---|
| 22bf5c5077 | |||
| 3a8990743e | |||
| fbc2cc8217 | |||
| efd4bc967d |
@ -5,11 +5,11 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
|
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
|
||||||
# Note that we have 800 MiB quota, please use it wisely.
|
# Note that we have 800 MiB quota, please use it wisely.
|
||||||
# See https://github.com/pypi/support/issues/6326 .
|
# See https://github.com/pypi/support/issues/6326 .
|
||||||
# Please also sync the value with the one in Dockerfile.
|
# Please also sync the value with the one in Dockerfile.
|
||||||
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
|
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
|
||||||
|
|
||||||
|
|
||||||
def print_top_10_largest_files(zip_file):
|
def print_top_10_largest_files(zip_file):
|
||||||
|
|||||||
@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
|
|
||||||
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
|
|
||||||
tasks:
|
|
||||||
- name: "gsm8k"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,strict-match"
|
|
||||||
value: 0.419
|
|
||||||
- name: "exact_match,flexible-extract"
|
|
||||||
value: 0.416
|
|
||||||
limit: 1000
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
|
|
||||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
|
||||||
backend: "vllm-vlm"
|
|
||||||
tasks:
|
|
||||||
- name: "chartqa"
|
|
||||||
metrics:
|
|
||||||
- name: "relaxed_accuracy,none"
|
|
||||||
value: 0.90
|
|
||||||
limit: 100
|
|
||||||
num_fewshot: 0
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
# For hf script, without -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
|
|
||||||
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
|
|
||||||
backend: "vllm-vlm"
|
|
||||||
tasks:
|
|
||||||
- name: "mmlu_pro"
|
|
||||||
metrics:
|
|
||||||
- name: "exact_match,custom-extract"
|
|
||||||
value: 0.80
|
|
||||||
limit: 250 # will run on 250 * 14 subjects = 3500 samples
|
|
||||||
num_fewshot: 5
|
|
||||||
@ -1,5 +1,4 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size)
|
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
|
|
||||||
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
|
||||||
tasks:
|
tasks:
|
||||||
- name: "gsm8k"
|
- name: "gsm8k"
|
||||||
|
|||||||
@ -1,12 +0,0 @@
|
|||||||
# For vllm script, with -t option (tensor parallel size).
|
|
||||||
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
|
|
||||||
|
|
||||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
|
||||||
backend: "vllm-vlm"
|
|
||||||
tasks:
|
|
||||||
- name: "chartqa"
|
|
||||||
metrics:
|
|
||||||
- name: "relaxed_accuracy,none"
|
|
||||||
value: 0.855
|
|
||||||
limit: 2500
|
|
||||||
num_fewshot: 0
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Qwen2.5-VL-7B-Instruct.yaml
|
|
||||||
@ -1,44 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on chartqa for vllm.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install lm-eval==0.4.9
|
|
||||||
|
|
||||||
usage() {
|
|
||||||
echo``
|
|
||||||
echo "Runs lm eval harness on ChartQA using multimodal vllm."
|
|
||||||
echo "This pathway is intended to be used to create baselines for "
|
|
||||||
echo "our correctness tests in vllm's CI."
|
|
||||||
echo
|
|
||||||
echo "usage: ${0} <options>"
|
|
||||||
echo
|
|
||||||
echo " -m - huggingface stub or local directory of the model"
|
|
||||||
echo " -l - limit number of samples to run"
|
|
||||||
echo " -t - tensor parallel size to run at"
|
|
||||||
echo
|
|
||||||
}
|
|
||||||
|
|
||||||
while getopts "m:l:t:" OPT; do
|
|
||||||
case ${OPT} in
|
|
||||||
m )
|
|
||||||
MODEL="$OPTARG"
|
|
||||||
;;
|
|
||||||
l )
|
|
||||||
LIMIT="$OPTARG"
|
|
||||||
;;
|
|
||||||
t )
|
|
||||||
TP_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
\? )
|
|
||||||
usage
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
lm_eval --model vllm-vlm \
|
|
||||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
|
|
||||||
--tasks chartqa \
|
|
||||||
--batch_size auto \
|
|
||||||
--apply_chat_template \
|
|
||||||
--limit $LIMIT
|
|
||||||
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Executable file → Normal file
0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Executable file → Normal file
@ -1,50 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
|
|
||||||
# We use this for fp8, which HF does not support.
|
|
||||||
#
|
|
||||||
# Make sure you have lm-eval-harness installed:
|
|
||||||
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
|
|
||||||
|
|
||||||
usage() {
|
|
||||||
echo``
|
|
||||||
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
|
|
||||||
echo "This pathway is intended to be used to create baselines for "
|
|
||||||
echo "our automated nm-test-accuracy workflow"
|
|
||||||
echo
|
|
||||||
echo "usage: ${0} <options>"
|
|
||||||
echo
|
|
||||||
echo " -m - huggingface stub or local directory of the model"
|
|
||||||
echo " -l - limit number of samples to run"
|
|
||||||
echo " -f - number of fewshot samples to use"
|
|
||||||
echo " -t - tensor parallel size to run at"
|
|
||||||
echo
|
|
||||||
}
|
|
||||||
|
|
||||||
while getopts "m:b:l:f:t:" OPT; do
|
|
||||||
case ${OPT} in
|
|
||||||
m )
|
|
||||||
MODEL="$OPTARG"
|
|
||||||
;;
|
|
||||||
b )
|
|
||||||
BATCH_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
l )
|
|
||||||
LIMIT="$OPTARG"
|
|
||||||
;;
|
|
||||||
f )
|
|
||||||
FEWSHOT="$OPTARG"
|
|
||||||
;;
|
|
||||||
t )
|
|
||||||
TP_SIZE="$OPTARG"
|
|
||||||
;;
|
|
||||||
\? )
|
|
||||||
usage
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
lm_eval --model vllm \
|
|
||||||
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
|
|
||||||
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
|
|
||||||
--batch_size auto
|
|
||||||
@ -19,27 +19,21 @@ RTOL = 0.08
|
|||||||
def launch_lm_eval(eval_config, tp_size):
|
def launch_lm_eval(eval_config, tp_size):
|
||||||
trust_remote_code = eval_config.get("trust_remote_code", False)
|
trust_remote_code = eval_config.get("trust_remote_code", False)
|
||||||
max_model_len = eval_config.get("max_model_len", 4096)
|
max_model_len = eval_config.get("max_model_len", 4096)
|
||||||
batch_size = eval_config.get("batch_size", "auto")
|
|
||||||
backend = eval_config.get("backend", "vllm")
|
|
||||||
model_args = (
|
model_args = (
|
||||||
f"pretrained={eval_config['model_name']},"
|
f"pretrained={eval_config['model_name']},"
|
||||||
f"tensor_parallel_size={tp_size},"
|
f"tensor_parallel_size={tp_size},"
|
||||||
f"enforce_eager=true,"
|
f"enforce_eager=true,"
|
||||||
f"add_bos_token=true,"
|
f"add_bos_token=true,"
|
||||||
f"trust_remote_code={trust_remote_code},"
|
f"trust_remote_code={trust_remote_code},"
|
||||||
f"max_model_len={max_model_len},"
|
f"max_model_len={max_model_len}"
|
||||||
)
|
)
|
||||||
results = lm_eval.simple_evaluate(
|
results = lm_eval.simple_evaluate(
|
||||||
model=backend,
|
model="vllm",
|
||||||
model_args=model_args,
|
model_args=model_args,
|
||||||
tasks=[task["name"] for task in eval_config["tasks"]],
|
tasks=[task["name"] for task in eval_config["tasks"]],
|
||||||
num_fewshot=eval_config["num_fewshot"],
|
num_fewshot=eval_config["num_fewshot"],
|
||||||
limit=eval_config["limit"],
|
limit=eval_config["limit"],
|
||||||
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
|
batch_size="auto",
|
||||||
# text models. however, this is regressing measured strict-match for
|
|
||||||
# existing text models in CI, so only apply it for mm.
|
|
||||||
apply_chat_template=backend == "vllm-vlm",
|
|
||||||
batch_size=batch_size,
|
|
||||||
)
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
@ -8,7 +8,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
@ -76,7 +76,7 @@ steps:
|
|||||||
queue: arm64_cpu_queue_postmerge
|
queue: arm64_cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
|
||||||
|
|
||||||
# Add job to create multi-arch manifest
|
# Add job to create multi-arch manifest
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -527,9 +527,8 @@ steps:
|
|||||||
# since torchao nightly is only compatible with torch nightly currently
|
# since torchao nightly is only compatible with torch nightly currently
|
||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
# TODO(jerryzh168): resolve the above comment
|
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||||
- uv pip install --system torchao==0.13.0
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
@ -734,16 +733,6 @@ steps:
|
|||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
|
||||||
timeout_in_minutes: 70
|
|
||||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
|
||||||
source_file_dependencies:
|
|
||||||
- vllm/multimodal/
|
|
||||||
- vllm/inputs/
|
|
||||||
- vllm/v1/core/
|
|
||||||
commands:
|
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1
|
- label: Multi-Modal Models Test (Extended) 1
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
|
|||||||
17
.coveragerc
17
.coveragerc
@ -1,10 +1,5 @@
|
|||||||
[run]
|
[run]
|
||||||
# Track the installed vllm package (this is what actually gets imported during tests)
|
source = vllm
|
||||||
# Use wildcard pattern to match the installed location
|
|
||||||
source =
|
|
||||||
vllm
|
|
||||||
*/dist-packages/vllm
|
|
||||||
*/site-packages/vllm
|
|
||||||
omit =
|
omit =
|
||||||
*/tests/*
|
*/tests/*
|
||||||
*/test_*
|
*/test_*
|
||||||
@ -17,16 +12,6 @@ omit =
|
|||||||
*/benchmarks/*
|
*/benchmarks/*
|
||||||
*/docs/*
|
*/docs/*
|
||||||
|
|
||||||
[paths]
|
|
||||||
# Map all possible vllm locations to a canonical "vllm" path
|
|
||||||
# This ensures coverage.combine properly merges data from different test runs
|
|
||||||
source =
|
|
||||||
vllm
|
|
||||||
/vllm-workspace/src/vllm
|
|
||||||
/vllm-workspace/vllm
|
|
||||||
*/site-packages/vllm
|
|
||||||
*/dist-packages/vllm
|
|
||||||
|
|
||||||
[report]
|
[report]
|
||||||
exclude_lines =
|
exclude_lines =
|
||||||
pragma: no cover
|
pragma: no cover
|
||||||
|
|||||||
@ -1,4 +0,0 @@
|
|||||||
# Migrate from `yapf` & `isort` to `ruff`
|
|
||||||
d6953beb91da4e9c99be4c0a1304a2d24189535c
|
|
||||||
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
|
|
||||||
8fcaaf6a165e661f63fc51be906bc05b0767332f
|
|
||||||
13
.github/CODEOWNERS
vendored
13
.github/CODEOWNERS
vendored
@ -5,7 +5,9 @@
|
|||||||
/vllm/attention @LucasWilkinson
|
/vllm/attention @LucasWilkinson
|
||||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||||
|
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||||
/vllm/model_executor/layers/fused_moe @mgoin
|
/vllm/model_executor/layers/fused_moe @mgoin
|
||||||
|
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||||
/vllm/model_executor/layers/mamba @tdoublep
|
/vllm/model_executor/layers/mamba @tdoublep
|
||||||
/vllm/model_executor/model_loader @22quinn
|
/vllm/model_executor/model_loader @22quinn
|
||||||
@ -24,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
|
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||||
/vllm/v1/attention @LucasWilkinson
|
/vllm/v1/attention @LucasWilkinson
|
||||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||||
@ -57,7 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
/tests/v1/offloading @ApostaC
|
/tests/v1/offloading @ApostaC
|
||||||
|
|
||||||
# Transformers backend
|
# Transformers backend
|
||||||
/vllm/model_executor/models/transformers @hmellor
|
/vllm/model_executor/models/transformers.py @hmellor
|
||||||
/tests/models/test_transformers.py @hmellor
|
/tests/models/test_transformers.py @hmellor
|
||||||
|
|
||||||
# Docs
|
# Docs
|
||||||
@ -118,11 +121,3 @@ mkdocs.yaml @hmellor
|
|||||||
|
|
||||||
# KVConnector installation files
|
# KVConnector installation files
|
||||||
/requirements/kv_connectors.txt @NickLucche
|
/requirements/kv_connectors.txt @NickLucche
|
||||||
|
|
||||||
# Pooling models
|
|
||||||
/examples/*/pooling/ @noooop
|
|
||||||
/tests/models/*/pooling* @noooop
|
|
||||||
/tests/entrypoints/pooling @noooop
|
|
||||||
/vllm/config/pooler.py @noooop
|
|
||||||
/vllm/pooling_params.py @noooop
|
|
||||||
/vllm/model_executor/layers/pooler.py @noooop
|
|
||||||
|
|||||||
138
.github/workflows/issue_autolabel.yml
vendored
138
.github/workflows/issue_autolabel.yml
vendored
@ -13,7 +13,6 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Label issues based on keywords
|
- name: Label issues based on keywords
|
||||||
id: label-step
|
|
||||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
||||||
with:
|
with:
|
||||||
script: |
|
script: |
|
||||||
@ -43,6 +42,7 @@ jobs:
|
|||||||
searchIn: "body"
|
searchIn: "body"
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
||||||
// Substring search - matches anywhere in text (partial matches)
|
// Substring search - matches anywhere in text (partial matches)
|
||||||
substrings: [
|
substrings: [
|
||||||
{
|
{
|
||||||
@ -89,12 +89,14 @@ jobs:
|
|||||||
term: "hip_",
|
term: "hip_",
|
||||||
searchIn: "both"
|
searchIn: "both"
|
||||||
},
|
},
|
||||||
|
|
||||||
// ROCm tools and libraries
|
// ROCm tools and libraries
|
||||||
{
|
{
|
||||||
term: "hipify",
|
term: "hipify",
|
||||||
searchIn: "both"
|
searchIn: "both"
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
||||||
// Regex patterns - for complex pattern matching
|
// Regex patterns - for complex pattern matching
|
||||||
regexPatterns: [
|
regexPatterns: [
|
||||||
{
|
{
|
||||||
@ -105,17 +107,13 @@ jobs:
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
// Add more label configurations here as needed
|
|
||||||
// example: {
|
|
||||||
// keywords: [...],
|
|
||||||
// substrings: [...],
|
|
||||||
// regexPatterns: [...]
|
|
||||||
// },
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Helper function to create regex based on search type
|
// Helper function to create regex based on search type
|
||||||
function createSearchRegex(term, type) {
|
function createSearchRegex(term, type) {
|
||||||
// Escape special regex characters in the term
|
// Escape special regex characters in the term
|
||||||
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||||||
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case 'keyword':
|
case 'keyword':
|
||||||
// Word boundary search - matches whole words only
|
// Word boundary search - matches whole words only
|
||||||
@ -127,13 +125,16 @@ jobs:
|
|||||||
throw new Error(`Unknown search type: ${type}`);
|
throw new Error(`Unknown search type: ${type}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to find matching terms in text with line information
|
// Helper function to find matching terms in text with line information
|
||||||
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
|
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
|
||||||
const matches = [];
|
const matches = [];
|
||||||
const lines = text.split('\n');
|
const lines = text.split('\n');
|
||||||
|
|
||||||
for (const termConfig of searchTerms) {
|
for (const termConfig of searchTerms) {
|
||||||
let regex;
|
let regex;
|
||||||
let term, searchIn, pattern, description, flags;
|
let term, searchIn, pattern, description, flags;
|
||||||
|
|
||||||
// Handle different input formats (string or object)
|
// Handle different input formats (string or object)
|
||||||
if (typeof termConfig === 'string') {
|
if (typeof termConfig === 'string') {
|
||||||
term = termConfig;
|
term = termConfig;
|
||||||
@ -145,17 +146,21 @@ jobs:
|
|||||||
description = termConfig.description;
|
description = termConfig.description;
|
||||||
flags = termConfig.flags;
|
flags = termConfig.flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip if this term shouldn't be searched in the current location
|
// Skip if this term shouldn't be searched in the current location
|
||||||
if (searchIn !== 'both' && searchIn !== searchLocation) {
|
if (searchIn !== 'both' && searchIn !== searchLocation) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create appropriate regex
|
// Create appropriate regex
|
||||||
if (searchType === 'regex') {
|
if (searchType === 'regex') {
|
||||||
regex = new RegExp(pattern, flags || "gi");
|
regex = new RegExp(pattern, flags || "gi");
|
||||||
} else {
|
} else {
|
||||||
regex = createSearchRegex(term, searchType);
|
regex = createSearchRegex(term, searchType);
|
||||||
}
|
}
|
||||||
|
|
||||||
const termMatches = [];
|
const termMatches = [];
|
||||||
|
|
||||||
// Check each line for matches
|
// Check each line for matches
|
||||||
lines.forEach((line, lineIndex) => {
|
lines.forEach((line, lineIndex) => {
|
||||||
const lineMatches = line.match(regex);
|
const lineMatches = line.match(regex);
|
||||||
@ -170,14 +175,15 @@ jobs:
|
|||||||
originalTerm: term || pattern,
|
originalTerm: term || pattern,
|
||||||
description: description,
|
description: description,
|
||||||
// Show context around the match in the line
|
// Show context around the match in the line
|
||||||
context: line.length > 100 ?
|
context: line.length > 100 ?
|
||||||
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
|
||||||
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
|
||||||
: line.trim()
|
: line.trim()
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
if (termMatches.length > 0) {
|
if (termMatches.length > 0) {
|
||||||
matches.push({
|
matches.push({
|
||||||
term: term || (description || pattern),
|
term: term || (description || pattern),
|
||||||
@ -190,48 +196,64 @@ jobs:
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return matches;
|
return matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to check if label should be added
|
// Helper function to check if label should be added
|
||||||
async function processLabel(labelName, config) {
|
async function processLabel(labelName, config) {
|
||||||
const body = context.payload.issue.body || "";
|
const body = context.payload.issue.body || "";
|
||||||
const title = context.payload.issue.title || "";
|
const title = context.payload.issue.title || "";
|
||||||
|
|
||||||
core.notice(`Processing label: ${labelName}`);
|
core.notice(`Processing label: ${labelName}`);
|
||||||
core.notice(`Issue Title: "${title}"`);
|
core.notice(`Issue Title: "${title}"`);
|
||||||
core.notice(`Issue Body length: ${body.length} characters`);
|
core.notice(`Issue Body length: ${body.length} characters`);
|
||||||
|
|
||||||
let shouldAddLabel = false;
|
let shouldAddLabel = false;
|
||||||
let allMatches = [];
|
let allMatches = [];
|
||||||
let reason = '';
|
let reason = '';
|
||||||
|
|
||||||
const keywords = config.keywords || [];
|
const keywords = config.keywords || [];
|
||||||
const substrings = config.substrings || [];
|
const substrings = config.substrings || [];
|
||||||
const regexPatterns = config.regexPatterns || [];
|
const regexPatterns = config.regexPatterns || [];
|
||||||
|
|
||||||
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
|
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
|
||||||
|
|
||||||
// Search in title
|
// Search in title
|
||||||
if (title.trim()) {
|
if (title.trim()) {
|
||||||
core.notice(`Searching in title: "${title}"`);
|
core.notice(`Searching in title: "${title}"`);
|
||||||
|
|
||||||
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
|
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
|
||||||
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
|
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
|
||||||
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
|
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
|
||||||
|
|
||||||
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
|
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Search in body
|
// Search in body
|
||||||
if (body.trim()) {
|
if (body.trim()) {
|
||||||
core.notice(`Searching in body (${body.length} characters)`);
|
core.notice(`Searching in body (${body.length} characters)`);
|
||||||
|
|
||||||
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
|
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
|
||||||
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
|
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
|
||||||
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
|
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
|
||||||
|
|
||||||
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
|
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (allMatches.length > 0) {
|
if (allMatches.length > 0) {
|
||||||
core.notice(`Found ${allMatches.length} matching term(s):`);
|
core.notice(`Found ${allMatches.length} matching term(s):`);
|
||||||
|
|
||||||
for (const termMatch of allMatches) {
|
for (const termMatch of allMatches) {
|
||||||
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
|
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
|
||||||
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
|
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
|
||||||
|
|
||||||
if (termMatch.searchType === 'regex') {
|
if (termMatch.searchType === 'regex') {
|
||||||
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||||
} else {
|
} else {
|
||||||
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Show details for each match
|
// Show details for each match
|
||||||
termMatch.matches.forEach((match, index) => {
|
termMatch.matches.forEach((match, index) => {
|
||||||
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
|
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
|
||||||
@ -244,6 +266,7 @@ jobs:
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
shouldAddLabel = true;
|
shouldAddLabel = true;
|
||||||
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
|
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
|
||||||
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
|
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
|
||||||
@ -251,10 +274,13 @@ jobs:
|
|||||||
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
|
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
|
||||||
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
|
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
|
||||||
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
|
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
|
||||||
|
|
||||||
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
|
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
|
||||||
}
|
}
|
||||||
|
|
||||||
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
|
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
|
||||||
core.notice(`Reason: ${reason || 'No matching terms found'}`);
|
core.notice(`Reason: ${reason || 'No matching terms found'}`);
|
||||||
|
|
||||||
if (shouldAddLabel) {
|
if (shouldAddLabel) {
|
||||||
const existingLabels = context.payload.issue.labels.map(l => l.name);
|
const existingLabels = context.payload.issue.labels.map(l => l.name);
|
||||||
if (!existingLabels.includes(labelName)) {
|
if (!existingLabels.includes(labelName)) {
|
||||||
@ -270,92 +296,14 @@ jobs:
|
|||||||
core.notice(`Label "${labelName}" already present.`);
|
core.notice(`Label "${labelName}" already present.`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
core.notice(`No matching terms found for label "${labelName}".`);
|
core.notice(`No matching terms found for label "${labelName}".`);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process all configured labels
|
// Process all configured labels
|
||||||
const labelsAddedResults = await Promise.all(
|
const processLabels = Object.entries(labelConfig)
|
||||||
Object.entries(labelConfig).map(([labelName, config]) =>
|
.map(([labelName, config]) => processLabel(labelName, config));
|
||||||
processLabel(labelName, config).then(added => ({ labelName, added }))
|
const labelsAdded = await Promise.all(processLabels);
|
||||||
)
|
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
|
||||||
);
|
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
||||||
|
|
||||||
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
|
|
||||||
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
|
|
||||||
|
|
||||||
// Return which labels were added for the next step
|
|
||||||
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
|
|
||||||
core.setOutput('labels_added', JSON.stringify(addedLabels));
|
|
||||||
return addedLabels;
|
|
||||||
|
|
||||||
- name: CC users for labeled issues
|
|
||||||
if: steps.label-step.outputs.labels_added != '[]'
|
|
||||||
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
|
|
||||||
with:
|
|
||||||
script: |
|
|
||||||
// Configuration: Map labels to GitHub users to CC
|
|
||||||
// You can add multiple users per label, and multiple label configurations
|
|
||||||
const ccConfig = {
|
|
||||||
rocm: {
|
|
||||||
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
|
|
||||||
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
|
|
||||||
},
|
|
||||||
// Add more label -> user mappings here
|
|
||||||
// Example:
|
|
||||||
// cuda: {
|
|
||||||
// users: ['user1', 'user2'],
|
|
||||||
// message: 'CC {users} for CUDA-related issue'
|
|
||||||
// },
|
|
||||||
// performance: {
|
|
||||||
// users: ['perfexpert'],
|
|
||||||
// message: 'CC {users} for performance issue'
|
|
||||||
// },
|
|
||||||
};
|
|
||||||
|
|
||||||
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
|
|
||||||
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
|
|
||||||
|
|
||||||
// Get existing comments to check for already mentioned users
|
|
||||||
const comments = await github.rest.issues.listComments({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
issue_number: context.issue.number,
|
|
||||||
});
|
|
||||||
|
|
||||||
const issueBody = context.payload.issue.body || '';
|
|
||||||
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
|
|
||||||
|
|
||||||
// Process each label that was added
|
|
||||||
for (const label of labelsAdded) {
|
|
||||||
if (ccConfig[label]) {
|
|
||||||
const config = ccConfig[label];
|
|
||||||
const usersToMention = [];
|
|
||||||
|
|
||||||
// Check which users haven't been mentioned yet
|
|
||||||
for (const user of config.users) {
|
|
||||||
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
|
|
||||||
if (!mentionPattern.test(allExistingText)) {
|
|
||||||
usersToMention.push(user);
|
|
||||||
} else {
|
|
||||||
core.notice(`@${user} already mentioned for label "${label}", skipping`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Post comment if there are users to mention
|
|
||||||
if (usersToMention.length > 0) {
|
|
||||||
const mentions = usersToMention.map(u => `@${u}`).join(' ');
|
|
||||||
const message = config.message.replace('{users}', mentions);
|
|
||||||
|
|
||||||
await github.rest.issues.createComment({
|
|
||||||
owner: context.repo.owner,
|
|
||||||
repo: context.repo.repo,
|
|
||||||
issue_number: context.issue.number,
|
|
||||||
body: message
|
|
||||||
});
|
|
||||||
|
|
||||||
core.notice(`CC comment added for label "${label}": ${mentions}`);
|
|
||||||
} else {
|
|
||||||
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -16,7 +16,6 @@ repos:
|
|||||||
rev: v1.38.1
|
rev: v1.38.1
|
||||||
hooks:
|
hooks:
|
||||||
- id: typos
|
- id: typos
|
||||||
args: [--force-exclude]
|
|
||||||
- repo: https://github.com/pre-commit/mirrors-clang-format
|
- repo: https://github.com/pre-commit/mirrors-clang-format
|
||||||
rev: v21.1.2
|
rev: v21.1.2
|
||||||
hooks:
|
hooks:
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import huggingface_hub.constants
|
import huggingface_hub.constants
|
||||||
@ -27,13 +28,13 @@ class RequestFuncInput:
|
|||||||
prompt_len: int
|
prompt_len: int
|
||||||
output_len: int
|
output_len: int
|
||||||
model: str
|
model: str
|
||||||
model_name: str | None = None
|
model_name: Optional[str] = None
|
||||||
logprobs: int | None = None
|
logprobs: Optional[int] = None
|
||||||
extra_body: dict | None = None
|
extra_body: Optional[dict] = None
|
||||||
multi_modal_content: dict | list[dict] | None = None
|
multi_modal_content: Optional[dict | list[dict]] = None
|
||||||
ignore_eos: bool = False
|
ignore_eos: bool = False
|
||||||
language: str | None = None
|
language: Optional[str] = None
|
||||||
request_id: str | None = None
|
request_id: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -51,7 +52,7 @@ class RequestFuncOutput:
|
|||||||
|
|
||||||
async def async_request_tgi(
|
async def async_request_tgi(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
@ -132,7 +133,7 @@ async def async_request_tgi(
|
|||||||
|
|
||||||
async def async_request_trt_llm(
|
async def async_request_trt_llm(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
@ -203,7 +204,7 @@ async def async_request_trt_llm(
|
|||||||
|
|
||||||
async def async_request_deepspeed_mii(
|
async def async_request_deepspeed_mii(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("completions", "profile")), (
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(
|
|||||||
|
|
||||||
async def async_request_openai_completions(
|
async def async_request_openai_completions(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("completions", "profile")), (
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
@ -366,7 +367,7 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
async def async_request_openai_chat_completions(
|
async def async_request_openai_chat_completions(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(("chat/completions", "profile")), (
|
assert api_url.endswith(("chat/completions", "profile")), (
|
||||||
@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(
|
|||||||
|
|
||||||
async def async_request_openai_audio(
|
async def async_request_openai_audio(
|
||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: tqdm | None = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||||
import soundfile
|
import soundfile
|
||||||
@ -609,7 +610,7 @@ def get_tokenizer(
|
|||||||
tokenizer_mode: str = "auto",
|
tokenizer_mode: str = "auto",
|
||||||
trust_remote_code: bool = False,
|
trust_remote_code: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
pretrained_model_name_or_path
|
pretrained_model_name_or_path
|
||||||
):
|
):
|
||||||
|
|||||||
@ -32,6 +32,7 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
|
|||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_length_range: tuple[int, int],
|
input_length_range: tuple[int, int],
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
) -> list[Request]:
|
) -> list[Request]:
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
@ -127,7 +128,7 @@ def sample_requests_from_random(
|
|||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
input_length_range: tuple[int, int],
|
input_length_range: tuple[int, int],
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
prefix_len: int,
|
prefix_len: int,
|
||||||
) -> list[Request]:
|
) -> list[Request]:
|
||||||
requests = []
|
requests = []
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||||
|
|
||||||
@ -23,7 +24,7 @@ def sample_requests(
|
|||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
num_requests: int,
|
num_requests: int,
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
fixed_output_len: int | None,
|
fixed_output_len: Optional[int],
|
||||||
) -> list[tuple[str, int, int, int]]:
|
) -> list[tuple[str, int, int, int]]:
|
||||||
if fixed_output_len is not None and fixed_output_len < 4:
|
if fixed_output_len is not None and fixed_output_len < 4:
|
||||||
raise ValueError("output_len too small")
|
raise ValueError("output_len too small")
|
||||||
|
|||||||
@ -31,8 +31,8 @@ import time
|
|||||||
import uuid
|
import uuid
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
from contextlib import nullcontext
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -316,7 +316,7 @@ def calculate_metrics(
|
|||||||
tokenizer: PreTrainedTokenizerBase,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
selected_percentile_metrics: list[str],
|
selected_percentile_metrics: list[str],
|
||||||
selected_percentiles: list[float],
|
selected_percentiles: list[float],
|
||||||
goodput_config_dict: dict[str, float] | None = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
) -> tuple[BenchmarkMetrics, list[int]]:
|
) -> tuple[BenchmarkMetrics, list[int]]:
|
||||||
actual_output_lens: list[int] = []
|
actual_output_lens: list[int] = []
|
||||||
total_input = 0
|
total_input = 0
|
||||||
@ -436,9 +436,9 @@ async def benchmark(
|
|||||||
selected_percentile_metrics: list[str],
|
selected_percentile_metrics: list[str],
|
||||||
selected_percentiles: list[str],
|
selected_percentiles: list[str],
|
||||||
ignore_eos: bool,
|
ignore_eos: bool,
|
||||||
max_concurrency: int | None,
|
max_concurrency: Optional[int],
|
||||||
structured_output_ratio: float,
|
structured_output_ratio: float,
|
||||||
goodput_config_dict: dict[str, float] | None = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||||
@ -502,9 +502,15 @@ async def benchmark(
|
|||||||
|
|
||||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
|
# This can be used once the minimum Python version is 3.10 or higher,
|
||||||
|
# and it will simplify the code in limited_request_func.
|
||||||
|
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||||
|
# if max_concurrency else contextlib.nullcontext())
|
||||||
|
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||||
|
|
||||||
async def limited_request_func(request_func_input, pbar):
|
async def limited_request_func(request_func_input, pbar):
|
||||||
|
if semaphore is None:
|
||||||
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
return await request_func(request_func_input=request_func_input, pbar=pbar)
|
||||||
|
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import math
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from types import TracebackType
|
from types import TracebackType
|
||||||
from typing import Any
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
|
||||||
def convert_to_pytorch_benchmark_format(
|
def convert_to_pytorch_benchmark_format(
|
||||||
@ -92,7 +92,7 @@ class TimeCollector:
|
|||||||
def __init__(self, scale: int) -> None:
|
def __init__(self, scale: int) -> None:
|
||||||
self.cnt: int = 0
|
self.cnt: int = 0
|
||||||
self._sum: int = 0
|
self._sum: int = 0
|
||||||
self._max: int | None = None
|
self._max: Optional[int] = None
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.start_time: int = time.monotonic_ns()
|
self.start_time: int = time.monotonic_ns()
|
||||||
|
|
||||||
@ -104,13 +104,13 @@ class TimeCollector:
|
|||||||
else:
|
else:
|
||||||
self._max = max(self._max, v)
|
self._max = max(self._max, v)
|
||||||
|
|
||||||
def avg(self) -> float | str:
|
def avg(self) -> Union[float, str]:
|
||||||
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
||||||
|
|
||||||
def max(self) -> float | str:
|
def max(self) -> Union[float, str]:
|
||||||
return self._max / self.scale if self._max else "N/A"
|
return self._max / self.scale if self._max else "N/A"
|
||||||
|
|
||||||
def dump_avg_max(self) -> list[float | str]:
|
def dump_avg_max(self) -> list[Union[float, str]]:
|
||||||
return [self.avg(), self.max()]
|
return [self.avg(), self.max()]
|
||||||
|
|
||||||
def __enter__(self) -> None:
|
def __enter__(self) -> None:
|
||||||
@ -118,8 +118,8 @@ class TimeCollector:
|
|||||||
|
|
||||||
def __exit__(
|
def __exit__(
|
||||||
self,
|
self,
|
||||||
exc_type: type[BaseException] | None,
|
exc_type: Optional[type[BaseException]],
|
||||||
exc_value: BaseException | None,
|
exc_value: Optional[BaseException],
|
||||||
exc_traceback: TracebackType | None,
|
exc_traceback: Optional[TracebackType],
|
||||||
) -> None:
|
) -> None:
|
||||||
self.collect(time.monotonic_ns() - self.start_time)
|
self.collect(time.monotonic_ns() - self.start_time)
|
||||||
|
|||||||
@ -6,7 +6,8 @@ import copy
|
|||||||
import itertools
|
import itertools
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
|
|||||||
@ -6,7 +6,8 @@ import copy
|
|||||||
import itertools
|
import itertools
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@ -52,7 +53,7 @@ def bench_int8(
|
|||||||
n: int,
|
n: int,
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
"""Benchmark INT8-based kernels."""
|
"""Benchmark INT8-based kernels."""
|
||||||
assert dtype == torch.int8
|
assert dtype == torch.int8
|
||||||
@ -107,7 +108,7 @@ def bench_fp8(
|
|||||||
n: int,
|
n: int,
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
"""Benchmark FP8-based kernels."""
|
"""Benchmark FP8-based kernels."""
|
||||||
assert dtype == torch.float8_e4m3fn
|
assert dtype == torch.float8_e4m3fn
|
||||||
@ -182,7 +183,7 @@ def bench(
|
|||||||
n: int,
|
n: int,
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
if dtype == torch.int8:
|
if dtype == torch.int8:
|
||||||
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
|
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
|
||||||
@ -200,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
|
|||||||
def run(
|
def run(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
MKNs: Iterable[tuple[int, int, int]],
|
MKNs: Iterable[tuple[int, int, int]],
|
||||||
bench_kernels: list[str] | None = None,
|
bench_kernels: Optional[list[str]] = None,
|
||||||
) -> Iterable[TMeasurement]:
|
) -> Iterable[TMeasurement]:
|
||||||
results = []
|
results = []
|
||||||
for m, k, n in MKNs:
|
for m, k, n in MKNs:
|
||||||
|
|||||||
@ -3,9 +3,10 @@
|
|||||||
|
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from itertools import product
|
from itertools import product
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
|
|||||||
def unfused_int8_impl(
|
def unfused_int8_impl(
|
||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: Optional[torch.Tensor],
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
@ -67,7 +68,7 @@ def unfused_int8_impl(
|
|||||||
def unfused_fp8_impl(
|
def unfused_fp8_impl(
|
||||||
rms_norm_layer: RMSNorm,
|
rms_norm_layer: RMSNorm,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: Optional[torch.Tensor],
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
# Norm
|
# Norm
|
||||||
@ -84,7 +85,7 @@ def unfused_fp8_impl(
|
|||||||
def fused_impl(
|
def fused_impl(
|
||||||
rms_norm_layer: RMSNorm, # this stores the weights
|
rms_norm_layer: RMSNorm, # this stores the weights
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None,
|
residual: Optional[torch.Tensor],
|
||||||
quant_dtype: torch.dtype,
|
quant_dtype: torch.dtype,
|
||||||
):
|
):
|
||||||
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
out, _ = ops.rms_norm_dynamic_per_token_quant(
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import itertools
|
import itertools
|
||||||
from collections.abc import Callable
|
from typing import Callable
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|||||||
@ -22,8 +22,8 @@ Example:
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable
|
|
||||||
from contextlib import nullcontext
|
from contextlib import nullcontext
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
|
|||||||
def benchmark_allreduce_single(
|
def benchmark_allreduce_single(
|
||||||
self,
|
self,
|
||||||
sequence_length: int,
|
sequence_length: int,
|
||||||
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
|
allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
|
||||||
should_use_fn: Callable[[torch.Tensor], bool],
|
should_use_fn: Callable[[torch.Tensor], bool],
|
||||||
context,
|
context,
|
||||||
num_warmup: int,
|
num_warmup: int,
|
||||||
num_trials: int,
|
num_trials: int,
|
||||||
) -> float | None:
|
) -> Optional[float]:
|
||||||
"""Benchmark method with CUDA graph optimization."""
|
"""Benchmark method with CUDA graph optimization."""
|
||||||
try:
|
try:
|
||||||
# Create test tensor (2D: sequence_length x hidden_size)
|
# Create test tensor (2D: sequence_length x hidden_size)
|
||||||
|
|||||||
@ -6,12 +6,11 @@ import copy
|
|||||||
import json
|
import json
|
||||||
import pickle
|
import pickle
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
from itertools import product
|
from itertools import product
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@ -159,7 +158,7 @@ def ref_group_gemm(
|
|||||||
seq_lens_cpu: torch.Tensor,
|
seq_lens_cpu: torch.Tensor,
|
||||||
prompt_lora_mapping_cpu: torch.Tensor,
|
prompt_lora_mapping_cpu: torch.Tensor,
|
||||||
scaling: float,
|
scaling: float,
|
||||||
add_inputs: bool | None,
|
add_inputs: Optional[bool],
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Torch group gemm reference implementation to test correctness of
|
Torch group gemm reference implementation to test correctness of
|
||||||
@ -317,8 +316,8 @@ class BenchmarkContext:
|
|||||||
lora_rank: int
|
lora_rank: int
|
||||||
sort_by_lora_id: bool
|
sort_by_lora_id: bool
|
||||||
dtype: torch.dtype
|
dtype: torch.dtype
|
||||||
seq_length: int | None = None
|
seq_length: Optional[int] = None
|
||||||
num_slices: int | None = None # num_slices for slice based ops
|
num_slices: Optional[int] = None # num_slices for slice based ops
|
||||||
|
|
||||||
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
|
def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
|
||||||
ctx = copy.copy(self)
|
ctx = copy.copy(self)
|
||||||
@ -562,7 +561,7 @@ class BenchmarkTensors:
|
|||||||
}
|
}
|
||||||
|
|
||||||
def bench_fn_kwargs(
|
def bench_fn_kwargs(
|
||||||
self, op_type: OpType, add_inputs: bool | None = None
|
self, op_type: OpType, add_inputs: Optional[bool] = None
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
if op_type.is_shrink_fn():
|
if op_type.is_shrink_fn():
|
||||||
assert add_inputs is None
|
assert add_inputs is None
|
||||||
@ -576,7 +575,7 @@ class BenchmarkTensors:
|
|||||||
raise ValueError(f"Unrecognized optype {self}")
|
raise ValueError(f"Unrecognized optype {self}")
|
||||||
|
|
||||||
def test_correctness(
|
def test_correctness(
|
||||||
self, op_type: OpType, expand_fn_add_inputs: bool | None
|
self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Test correctness of op_type implementation against a grouped gemm
|
Test correctness of op_type implementation against a grouped gemm
|
||||||
@ -612,8 +611,8 @@ def bench_optype(
|
|||||||
ctx: BenchmarkContext,
|
ctx: BenchmarkContext,
|
||||||
arg_pool_size: int,
|
arg_pool_size: int,
|
||||||
op_type: OpType,
|
op_type: OpType,
|
||||||
cuda_graph_nops: int | None = None,
|
cuda_graph_nops: Optional[int] = None,
|
||||||
expand_fn_add_inputs: bool | None = None,
|
expand_fn_add_inputs: Optional[bool] = None,
|
||||||
test_correctness: bool = False,
|
test_correctness: bool = False,
|
||||||
) -> TMeasurement:
|
) -> TMeasurement:
|
||||||
assert arg_pool_size >= 1
|
assert arg_pool_size >= 1
|
||||||
@ -680,7 +679,7 @@ def bench_torch_mm(
|
|||||||
ctx: BenchmarkContext,
|
ctx: BenchmarkContext,
|
||||||
arg_pool_size: int,
|
arg_pool_size: int,
|
||||||
op_type: OpType,
|
op_type: OpType,
|
||||||
cuda_graph_nops: int | None = None,
|
cuda_graph_nops: Optional[int] = None,
|
||||||
) -> TMeasurement:
|
) -> TMeasurement:
|
||||||
"""
|
"""
|
||||||
Benchmark basic torch.mm as a roofline.
|
Benchmark basic torch.mm as a roofline.
|
||||||
@ -745,7 +744,7 @@ def use_cuda_graph_recommendation() -> str:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
|
def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
|
||||||
compare = TBenchmark.Compare(timers)
|
compare = TBenchmark.Compare(timers)
|
||||||
compare.print()
|
compare.print()
|
||||||
|
|
||||||
|
|||||||
@ -8,9 +8,10 @@ import math
|
|||||||
import os
|
import os
|
||||||
import pickle as pkl
|
import pickle as pkl
|
||||||
import time
|
import time
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from itertools import product
|
from itertools import product
|
||||||
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import torch
|
import torch
|
||||||
@ -62,23 +63,23 @@ class BenchmarkTensors:
|
|||||||
a: torch.Tensor
|
a: torch.Tensor
|
||||||
|
|
||||||
w_q: torch.Tensor
|
w_q: torch.Tensor
|
||||||
group_size: int | None
|
group_size: Optional[int]
|
||||||
wtype: ScalarType
|
wtype: ScalarType
|
||||||
w_g_s: torch.Tensor
|
w_g_s: torch.Tensor
|
||||||
w_g_zp: torch.Tensor | None
|
w_g_zp: Optional[torch.Tensor]
|
||||||
w_ch_s: torch.Tensor | None
|
w_ch_s: Optional[torch.Tensor]
|
||||||
w_tok_s: torch.Tensor | None
|
w_tok_s: Optional[torch.Tensor]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TypeConfig:
|
class TypeConfig:
|
||||||
act_type: torch.dtype
|
act_type: torch.dtype
|
||||||
weight_type: ScalarType
|
weight_type: ScalarType
|
||||||
output_type: torch.dtype | None
|
output_type: Optional[torch.dtype]
|
||||||
group_scale_type: torch.dtype | None
|
group_scale_type: Optional[torch.dtype]
|
||||||
group_zero_type: torch.dtype | None
|
group_zero_type: Optional[torch.dtype]
|
||||||
channel_scale_type: torch.dtype | None
|
channel_scale_type: Optional[torch.dtype]
|
||||||
token_scale_type: torch.dtype | None
|
token_scale_type: Optional[torch.dtype]
|
||||||
|
|
||||||
|
|
||||||
def rand_data(shape, dtype=torch.float16, scale=1):
|
def rand_data(shape, dtype=torch.float16, scale=1):
|
||||||
@ -92,8 +93,8 @@ def quantize_and_pack(
|
|||||||
atype: torch.dtype,
|
atype: torch.dtype,
|
||||||
w: torch.Tensor,
|
w: torch.Tensor,
|
||||||
wtype: ScalarType,
|
wtype: ScalarType,
|
||||||
stype: torch.dtype | None,
|
stype: Optional[torch.dtype],
|
||||||
group_size: int | None,
|
group_size: Optional[int],
|
||||||
zero_points: bool = False,
|
zero_points: bool = False,
|
||||||
):
|
):
|
||||||
assert wtype.is_integer(), "TODO: support floating point weights"
|
assert wtype.is_integer(), "TODO: support floating point weights"
|
||||||
@ -112,7 +113,7 @@ def quantize_and_pack(
|
|||||||
|
|
||||||
|
|
||||||
def create_bench_tensors(
|
def create_bench_tensors(
|
||||||
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
|
shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
|
||||||
) -> list[BenchmarkTensors]:
|
) -> list[BenchmarkTensors]:
|
||||||
m, n, k = shape
|
m, n, k = shape
|
||||||
|
|
||||||
@ -330,8 +331,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
|
_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
|
||||||
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
|
_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
def bench(
|
def bench(
|
||||||
|
|||||||
@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
|
|||||||
else:
|
else:
|
||||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||||
block_quant_shape = get_weight_block_size_safety(config)
|
block_quant_shape = get_weight_block_size_safety(config)
|
||||||
|
|||||||
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
|
|||||||
topk = config.num_experts_per_tok
|
topk = config.num_experts_per_tok
|
||||||
|
|
||||||
hidden_size = config.hidden_size
|
hidden_size = config.hidden_size
|
||||||
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
|
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||||
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
use_int8_w8a16 = args.dtype == "int8_w8a16"
|
||||||
use_customized_permute = args.use_customized_permute
|
use_customized_permute = args.use_customized_permute
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -36,7 +37,7 @@ def main(
|
|||||||
seed: int,
|
seed: int,
|
||||||
do_profile: bool,
|
do_profile: bool,
|
||||||
device: str = "cuda",
|
device: str = "cuda",
|
||||||
kv_cache_dtype: str | None = None,
|
kv_cache_dtype: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
current_platform.seed_everything(seed)
|
current_platform.seed_everything(seed)
|
||||||
|
|
||||||
|
|||||||
@ -3,8 +3,8 @@
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import math
|
import math
|
||||||
from collections.abc import Callable
|
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
from typing import Callable
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
|
from flashinfer.norm import fused_add_rmsnorm, rmsnorm
|
||||||
@ -20,8 +21,8 @@ class HuggingFaceRMSNorm(nn.Module):
|
|||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
residual: torch.Tensor | None = None,
|
residual: Optional[torch.Tensor] = None,
|
||||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
|
||||||
orig_dtype = x.dtype
|
orig_dtype = x.dtype
|
||||||
x = x.to(torch.float32)
|
x = x.to(torch.float32)
|
||||||
if residual is not None:
|
if residual is not None:
|
||||||
@ -40,7 +41,7 @@ class HuggingFaceRMSNorm(nn.Module):
|
|||||||
def rmsnorm_naive(
|
def rmsnorm_naive(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
residual: torch.Tensor | None = None,
|
residual: Optional[torch.Tensor] = None,
|
||||||
eps: float = 1e-6,
|
eps: float = 1e-6,
|
||||||
):
|
):
|
||||||
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
|
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
|
||||||
@ -64,7 +65,7 @@ def rmsnorm_naive(
|
|||||||
def rmsnorm_flashinfer(
|
def rmsnorm_flashinfer(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
residual: torch.Tensor | None = None,
|
residual: Optional[torch.Tensor] = None,
|
||||||
eps: float = 1e-6,
|
eps: float = 1e-6,
|
||||||
):
|
):
|
||||||
orig_shape = x.shape
|
orig_shape = x.shape
|
||||||
@ -88,7 +89,7 @@ def rmsnorm_flashinfer(
|
|||||||
def rmsnorm_vllm(
|
def rmsnorm_vllm(
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
weight: torch.Tensor,
|
weight: torch.Tensor,
|
||||||
residual: torch.Tensor | None = None,
|
residual: Optional[torch.Tensor] = None,
|
||||||
eps: float = 1e-6,
|
eps: float = 1e-6,
|
||||||
):
|
):
|
||||||
orig_shape = x.shape
|
orig_shape = x.shape
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
from itertools import accumulate
|
from itertools import accumulate
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import nvtx
|
import nvtx
|
||||||
import torch
|
import torch
|
||||||
@ -17,7 +18,7 @@ def benchmark_rope_kernels_multi_lora(
|
|||||||
seq_len: int,
|
seq_len: int,
|
||||||
num_heads: int,
|
num_heads: int,
|
||||||
head_size: int,
|
head_size: int,
|
||||||
rotary_dim: int | None,
|
rotary_dim: Optional[int],
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
seed: int,
|
seed: int,
|
||||||
device: str,
|
device: str,
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import torch
|
import torch
|
||||||
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
|||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def benchmark_decode(
|
def benchmark_decode(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
quant_dtypes: tuple[
|
||||||
|
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||||
|
],
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
max_seq_len: int,
|
max_seq_len: int,
|
||||||
num_heads: tuple[int, int] = (64, 8),
|
num_heads: tuple[int, int] = (64, 8),
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import torch
|
import torch
|
||||||
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
|
|||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def benchmark_prefill(
|
def benchmark_prefill(
|
||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
|
quant_dtypes: tuple[
|
||||||
|
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
|
||||||
|
],
|
||||||
batch_size: int,
|
batch_size: int,
|
||||||
max_seq_len: int,
|
max_seq_len: int,
|
||||||
num_heads: tuple[int, int] = (64, 8),
|
num_heads: tuple[int, int] = (64, 8),
|
||||||
|
|||||||
@ -2,8 +2,8 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
from typing import Any
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.benchmark as TBenchmark
|
import torch.utils.benchmark as TBenchmark
|
||||||
@ -55,7 +55,7 @@ class Bench:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
cuda_graph_params: CudaGraphBenchParams | None,
|
cuda_graph_params: Optional[CudaGraphBenchParams],
|
||||||
label: str,
|
label: str,
|
||||||
sub_label: str,
|
sub_label: str,
|
||||||
description: str,
|
description: str,
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from statistics import mean
|
from statistics import mean
|
||||||
from typing import Any, NamedTuple
|
from typing import Any, NamedTuple, Optional, Union
|
||||||
|
|
||||||
import numpy as np # type: ignore
|
import numpy as np # type: ignore
|
||||||
import pandas as pd # type: ignore
|
import pandas as pd # type: ignore
|
||||||
@ -35,8 +35,8 @@ class Distribution(ABC):
|
|||||||
class UniformDistribution(Distribution):
|
class UniformDistribution(Distribution):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
min_val: int | float,
|
min_val: Union[int, float],
|
||||||
max_val: int | float,
|
max_val: Union[int, float],
|
||||||
is_integer: bool = True,
|
is_integer: bool = True,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.min_val = min_val
|
self.min_val = min_val
|
||||||
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
|
|||||||
|
|
||||||
|
|
||||||
class ConstantDistribution(Distribution):
|
class ConstantDistribution(Distribution):
|
||||||
def __init__(self, value: int | float) -> None:
|
def __init__(self, value: Union[int, float]) -> None:
|
||||||
self.value = value
|
self.value = value
|
||||||
self.max_val = value
|
self.max_val = value
|
||||||
|
|
||||||
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
|
|||||||
|
|
||||||
|
|
||||||
class ZipfDistribution(Distribution):
|
class ZipfDistribution(Distribution):
|
||||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||||
self.alpha = alpha
|
self.alpha = alpha
|
||||||
self.max_val = max_val
|
self.max_val = max_val
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
|
|||||||
|
|
||||||
|
|
||||||
class PoissonDistribution(Distribution):
|
class PoissonDistribution(Distribution):
|
||||||
def __init__(self, alpha: float, max_val: int | None = None) -> None:
|
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
|
||||||
self.alpha = alpha
|
self.alpha = alpha
|
||||||
self.max_val = max_val
|
self.max_val = max_val
|
||||||
|
|
||||||
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
|
|||||||
class LognormalDistribution(Distribution):
|
class LognormalDistribution(Distribution):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
mean: float | None = None,
|
mean: Optional[float] = None,
|
||||||
sigma: float | None = None,
|
sigma: Optional[float] = None,
|
||||||
average: int | None = None,
|
average: Optional[int] = None,
|
||||||
median_ratio: float | None = None,
|
median_ratio: Optional[float] = None,
|
||||||
max_val: int | None = None,
|
max_val: Optional[int] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.average = average
|
self.average = average
|
||||||
self.median_ratio = median_ratio
|
self.median_ratio = median_ratio
|
||||||
|
|||||||
@ -13,7 +13,7 @@ from datetime import datetime
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from statistics import mean
|
from statistics import mean
|
||||||
from typing import NamedTuple
|
from typing import NamedTuple, Union
|
||||||
|
|
||||||
import aiohttp # type: ignore
|
import aiohttp # type: ignore
|
||||||
import numpy as np # type: ignore
|
import numpy as np # type: ignore
|
||||||
@ -169,7 +169,7 @@ class MovingAverage:
|
|||||||
class DebugStats:
|
class DebugStats:
|
||||||
def __init__(self, logger: logging.Logger, window_size: int) -> None:
|
def __init__(self, logger: logging.Logger, window_size: int) -> None:
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.metrics: dict[str, MovingAverage | MetricStats] = {
|
self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
|
||||||
"moving_avg_ttft_ms": MovingAverage(window_size),
|
"moving_avg_ttft_ms": MovingAverage(window_size),
|
||||||
"moving_avg_tpot_ms": MovingAverage(window_size),
|
"moving_avg_tpot_ms": MovingAverage(window_size),
|
||||||
"ttft_ms": MetricStats(),
|
"ttft_ms": MetricStats(),
|
||||||
@ -636,7 +636,7 @@ async def client_main(
|
|||||||
|
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
curr_time_sec: float = time.perf_counter()
|
curr_time_sec: float = time.perf_counter()
|
||||||
time_since_last_turn: str | float = "N/A"
|
time_since_last_turn: Union[str, float] = "N/A"
|
||||||
if conv_id in time_of_last_turn:
|
if conv_id in time_of_last_turn:
|
||||||
time_since_last_turn = round(
|
time_since_last_turn = round(
|
||||||
curr_time_sec - time_of_last_turn[conv_id], 3
|
curr_time_sec - time_of_last_turn[conv_id], 3
|
||||||
@ -928,13 +928,13 @@ async def main_mp(
|
|||||||
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
|
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
|
||||||
)
|
)
|
||||||
|
|
||||||
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
|
rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
|
||||||
if len(client_metrics) < (5 * bench_args.num_clients):
|
if len(client_metrics) < (5 * bench_args.num_clients):
|
||||||
# Do not estimate the RPS if the number of samples is very low
|
# Do not estimate the RPS if the number of samples is very low
|
||||||
# (threshold can be tuned if needed)
|
# (threshold can be tuned if needed)
|
||||||
rps = "N/A"
|
rps = "N/A"
|
||||||
|
|
||||||
runtime_left_sec: str | float = round(
|
runtime_left_sec: Union[str, float] = round(
|
||||||
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
|
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
|
||||||
)
|
)
|
||||||
if percent < 0.05:
|
if percent < 0.05:
|
||||||
|
|||||||
@ -13,7 +13,7 @@ import argparse
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
from statistics import mean
|
from statistics import mean
|
||||||
from typing import Any
|
from typing import Any, Optional
|
||||||
|
|
||||||
import pandas as pd # type: ignore
|
import pandas as pd # type: ignore
|
||||||
import tqdm # type: ignore
|
import tqdm # type: ignore
|
||||||
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def content_is_valid(
|
def content_is_valid(
|
||||||
content: str, min_content_len: int | None, max_content_len: int | None
|
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
|
||||||
) -> bool:
|
) -> bool:
|
||||||
if min_content_len and len(content) < min_content_len:
|
if min_content_len and len(content) < min_content_len:
|
||||||
return False
|
return False
|
||||||
@ -37,7 +37,7 @@ def content_is_valid(
|
|||||||
|
|
||||||
|
|
||||||
def print_stats(
|
def print_stats(
|
||||||
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
|
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
# Collect statistics
|
# Collect statistics
|
||||||
stats = []
|
stats = []
|
||||||
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
|
|||||||
seed: int,
|
seed: int,
|
||||||
input_file: str,
|
input_file: str,
|
||||||
output_file: str,
|
output_file: str,
|
||||||
max_items: int | None,
|
max_items: Optional[int],
|
||||||
min_content_len: int | None = None,
|
min_content_len: Optional[int] = None,
|
||||||
max_content_len: int | None = None,
|
max_content_len: Optional[int] = None,
|
||||||
min_turns: int | None = None,
|
min_turns: Optional[int] = None,
|
||||||
max_turns: int | None = None,
|
max_turns: Optional[int] = None,
|
||||||
model: str | None = None,
|
model: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
if min_turns and max_turns:
|
if min_turns and max_turns:
|
||||||
assert min_turns <= max_turns
|
assert min_turns <= max_turns
|
||||||
|
|||||||
@ -22,10 +22,10 @@ else()
|
|||||||
CONFIGURE_COMMAND ""
|
CONFIGURE_COMMAND ""
|
||||||
BUILD_COMMAND ""
|
BUILD_COMMAND ""
|
||||||
)
|
)
|
||||||
|
FetchContent_Populate(qutlass)
|
||||||
|
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
FetchContent_Populate(qutlass)
|
|
||||||
|
|
||||||
if(NOT qutlass_SOURCE_DIR)
|
if(NOT qutlass_SOURCE_DIR)
|
||||||
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
|
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
12
codecov.yml
12
codecov.yml
@ -1,12 +0,0 @@
|
|||||||
codecov:
|
|
||||||
require_ci_to_pass: false
|
|
||||||
|
|
||||||
fixes:
|
|
||||||
# Map source code paths to repository root paths
|
|
||||||
# Wildcards match any Python version (python3.*)
|
|
||||||
- "/vllm-workspace/src/vllm/::vllm/"
|
|
||||||
- "/vllm-workspace/vllm/::vllm/"
|
|
||||||
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
|
|
||||||
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
|
|
||||||
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
|
|
||||||
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"
|
|
||||||
@ -125,37 +125,32 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void set_split_kv (KernelArguments& args) {
|
static void set_split_kv (KernelArguments& args) {
|
||||||
|
// printf("set_split_kv start");
|
||||||
if (args.split_kv >= 1) return;
|
if (args.split_kv >= 1) return;
|
||||||
auto [H, K, D, B] = args.problem_shape;
|
auto [H, K, D, B] = args.problem_shape;
|
||||||
|
// std::cout << H << " " << K << " " << D << " " << B << "\n";
|
||||||
int sm_count = args.hw_info.sm_count;
|
int sm_count = args.hw_info.sm_count;
|
||||||
float seq_length_k = static_cast<float>(K) / 1024.0f;
|
// printf(" sm_count = %d\n", sm_count);
|
||||||
int max_splits = 1;
|
int max_splits = ceil_div(K, 128);
|
||||||
|
max_splits = min(16, max_splits);
|
||||||
|
|
||||||
if (B <= 4 && seq_length_k >= 16) {
|
// TODO: This avoids a hang when the batch size larger than 1 and
|
||||||
max_splits = 16;
|
// there is more than 1 kv_splits.
|
||||||
|
// Discuss with NVIDIA how this can be fixed.
|
||||||
|
if (B > 1) {
|
||||||
|
max_splits = min(1, max_splits);
|
||||||
}
|
}
|
||||||
else if (B <= 8 && seq_length_k >= 4) {
|
|
||||||
max_splits = 8;
|
// printf(" max_splits = %d\n", max_splits);
|
||||||
}
|
|
||||||
else if ((B <= 16 && seq_length_k >= 8) ||
|
|
||||||
(B == 48 && seq_length_k >= 32)) {
|
|
||||||
max_splits = 4;
|
|
||||||
}
|
|
||||||
else if ((B <= 32 && seq_length_k >= 16) ||
|
|
||||||
(B == 96 && seq_length_k >= 16)) {
|
|
||||||
max_splits = 2;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
max_splits = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wave-aware scheduling: ensure integer number of waves in K dimension
|
|
||||||
int sms_per_batch = max(1, sm_count / B);
|
int sms_per_batch = max(1, sm_count / B);
|
||||||
|
// printf(" sms_per_batch = %d\n", sms_per_batch);
|
||||||
int split_heur = min(max_splits, sms_per_batch);
|
int split_heur = min(max_splits, sms_per_batch);
|
||||||
int waves = ceil_div(B * split_heur, sm_count);
|
int waves = ceil_div(B * split_heur, sm_count);
|
||||||
int k_waves = ceil_div(max_splits, split_heur);
|
int k_waves = ceil_div(max_splits, split_heur);
|
||||||
int split_wave_aware = ceil_div(max_splits, k_waves);
|
int split_wave_aware = ceil_div(max_splits, k_waves);
|
||||||
args.split_kv = split_wave_aware;
|
args.split_kv = split_wave_aware;
|
||||||
|
// printf(" args.split_kv = %d\n", args.split_kv);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Determines whether the GEMM can execute the given problem.
|
/// Determines whether the GEMM can execute the given problem.
|
||||||
|
|||||||
@ -5,11 +5,11 @@
|
|||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// vllm_is_batch_invariant(); returns true
|
// vllm_kernel_override_batch_invariant(); returns true
|
||||||
// if env VLLM_BATCH_INVARIANT=1
|
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
|
||||||
inline bool vllm_is_batch_invariant() {
|
inline bool vllm_kernel_override_batch_invariant() {
|
||||||
static bool cached = []() {
|
static bool cached = []() {
|
||||||
std::string env_key = "VLLM_BATCH_INVARIANT";
|
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
|
||||||
const char* val = std::getenv(env_key.c_str());
|
const char* val = std::getenv(env_key.c_str());
|
||||||
return (val && std::atoi(val) != 0) ? 1 : 0;
|
return (val && std::atoi(val) != 0) ? 1 : 0;
|
||||||
}();
|
}();
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import enum
|
import enum
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from cutlass_library import *
|
from cutlass_library import *
|
||||||
|
|
||||||
@ -21,7 +22,7 @@ class MixedInputKernelScheduleType(enum.Enum):
|
|||||||
TmaWarpSpecializedCooperative = enum_auto()
|
TmaWarpSpecializedCooperative = enum_auto()
|
||||||
|
|
||||||
|
|
||||||
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
|
VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
|
||||||
**DataTypeNames, # type: ignore
|
**DataTypeNames, # type: ignore
|
||||||
**{
|
**{
|
||||||
VLLMDataType.u4b8: "u4b8",
|
VLLMDataType.u4b8: "u4b8",
|
||||||
@ -29,7 +30,7 @@ VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||||
**DataTypeTag, # type: ignore
|
**DataTypeTag, # type: ignore
|
||||||
**{
|
**{
|
||||||
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
|
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
|
||||||
@ -37,7 +38,7 @@ VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
|
VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
|
||||||
**DataTypeSize, # type: ignore
|
**DataTypeSize, # type: ignore
|
||||||
**{
|
**{
|
||||||
VLLMDataType.u4b8: 4,
|
VLLMDataType.u4b8: 4,
|
||||||
@ -45,7 +46,7 @@ VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
|
VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||||
VLLMDataType.u4b8: "vllm::kU4B8",
|
VLLMDataType.u4b8: "vllm::kU4B8",
|
||||||
VLLMDataType.u8b128: "vllm::kU8B128",
|
VLLMDataType.u8b128: "vllm::kU8B128",
|
||||||
DataType.u4: "vllm::kU4",
|
DataType.u4: "vllm::kU4",
|
||||||
@ -56,7 +57,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
|
|||||||
DataType.bf16: "vllm::kBfloat16",
|
DataType.bf16: "vllm::kBfloat16",
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
|
||||||
DataType.u8: "at::ScalarType::Byte",
|
DataType.u8: "at::ScalarType::Byte",
|
||||||
DataType.s8: "at::ScalarType::Char",
|
DataType.s8: "at::ScalarType::Char",
|
||||||
DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
|
DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
|
||||||
@ -66,7 +67,9 @@ VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
|
|||||||
DataType.f32: "at::ScalarType::Float",
|
DataType.f32: "at::ScalarType::Float",
|
||||||
}
|
}
|
||||||
|
|
||||||
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
|
VLLMKernelScheduleTag: dict[
|
||||||
|
Union[MixedInputKernelScheduleType, KernelScheduleType], str
|
||||||
|
] = {
|
||||||
**KernelScheduleTag, # type: ignore
|
**KernelScheduleTag, # type: ignore
|
||||||
**{
|
**{
|
||||||
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
|
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501
|
||||||
|
|||||||
@ -2,7 +2,6 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "cub_helpers.h"
|
#include "cub_helpers.h"
|
||||||
#include "core/batch_invariant.hpp"
|
#include "core/batch_invariant.hpp"
|
||||||
#include "quantization/vectorization_utils.cuh"
|
|
||||||
|
|
||||||
#include <torch/cuda.h>
|
#include <torch/cuda.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
@ -19,22 +18,11 @@ __global__ void rms_norm_kernel(
|
|||||||
const float epsilon, const int num_tokens, const int hidden_size) {
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
__shared__ float s_variance;
|
__shared__ float s_variance;
|
||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
|
||||||
|
|
||||||
constexpr int VEC_SIZE = 8;
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
|
||||||
float x = static_cast<float>(vec.val[i]);
|
|
||||||
variance += x * x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
auto scalar_op = [&variance](const scalar_t& val) {
|
|
||||||
float x = static_cast<float>(val);
|
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
};
|
}
|
||||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
|
||||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
|
||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
@ -426,7 +414,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
|||||||
wt_ptr % req_alignment_bytes == 0;
|
wt_ptr % req_alignment_bytes == 0;
|
||||||
bool offsets_are_multiple_of_vector_width =
|
bool offsets_are_multiple_of_vector_width =
|
||||||
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
||||||
!batch_invariant_launch) {
|
!batch_invariant_launch) {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
@ -474,7 +462,7 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
|||||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
||||||
LAUNCH_FUSED_POLY_NORM(8);
|
LAUNCH_FUSED_POLY_NORM(8);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@ -10,7 +10,6 @@
|
|||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "cub_helpers.h"
|
#include "cub_helpers.h"
|
||||||
#include "core/batch_invariant.hpp"
|
#include "core/batch_invariant.hpp"
|
||||||
#include "quantization/vectorization_utils.cuh"
|
|
||||||
|
|
||||||
#include <torch/cuda.h>
|
#include <torch/cuda.h>
|
||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
@ -29,22 +28,10 @@ __global__ void rms_norm_static_fp8_quant_kernel(
|
|||||||
__shared__ float s_variance;
|
__shared__ float s_variance;
|
||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
|
|
||||||
const scalar_t* input_row = input + blockIdx.x * input_stride;
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
|
const float x = (float)input[blockIdx.x * input_stride + idx];
|
||||||
constexpr int VEC_SIZE = 8;
|
|
||||||
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int i = 0; i < VEC_SIZE; ++i) {
|
|
||||||
float x = static_cast<float>(vec.val[i]);
|
|
||||||
variance += x * x;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
auto scalar_op = [&variance](const scalar_t& val) {
|
|
||||||
float x = static_cast<float>(val);
|
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
};
|
}
|
||||||
vllm::vectorize_read_with_alignment<VEC_SIZE>(
|
|
||||||
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
|
|
||||||
|
|
||||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||||
__shared__ typename BlockReduce::TempStorage reduceStore;
|
__shared__ typename BlockReduce::TempStorage reduceStore;
|
||||||
@ -254,7 +241,7 @@ void fused_add_rms_norm_static_fp8_quant(
|
|||||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||||
bool ptrs_are_aligned =
|
bool ptrs_are_aligned =
|
||||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||||
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
|
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
||||||
!batch_invariant_launch) {
|
!batch_invariant_launch) {
|
||||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
|
|||||||
@ -8,77 +8,12 @@
|
|||||||
|
|
||||||
#include "../cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "../dispatch_utils.h"
|
#include "../dispatch_utils.h"
|
||||||
#include "core/math.hpp"
|
|
||||||
|
|
||||||
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
namespace moe {
|
namespace moe {
|
||||||
|
|
||||||
namespace batched_moe_align_block_size {
|
|
||||||
|
|
||||||
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
|
|
||||||
static constexpr int32_t num_threads = 1024;
|
|
||||||
static constexpr int32_t num_blocks = 1;
|
|
||||||
__global__ void batched_moe_align_block_size_kernel(
|
|
||||||
int32_t const num_batches, int32_t const max_tokens_per_batch,
|
|
||||||
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
|
|
||||||
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
|
|
||||||
int32_t* __restrict__ num_tokens_post_pad) {
|
|
||||||
// TODO(varun): This is a naive implementation. Could be optimized.
|
|
||||||
|
|
||||||
size_t const batch_id = threadIdx.x;
|
|
||||||
size_t const stride = blockDim.x * gridDim.x;
|
|
||||||
int32_t const num_blocks_per_batch =
|
|
||||||
CEILDIV(max_tokens_per_batch, block_size);
|
|
||||||
int32_t const sorted_ids_size =
|
|
||||||
num_blocks_per_batch * num_batches * block_size;
|
|
||||||
int32_t const block_ids_size = sorted_ids_size / block_size;
|
|
||||||
int32_t const SENTINEL =
|
|
||||||
num_batches * max_tokens_per_batch; // To denote invalid entries.
|
|
||||||
// Intialize sorted_ids
|
|
||||||
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
|
|
||||||
sorted_ids[i] = SENTINEL;
|
|
||||||
}
|
|
||||||
// Intialize expert_ids with -1
|
|
||||||
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
|
|
||||||
block_ids[i] = -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t b_num_tokens = 0;
|
|
||||||
if (batch_id < num_batches) {
|
|
||||||
b_num_tokens = batch_num_tokens[batch_id];
|
|
||||||
}
|
|
||||||
int32_t const ceil_b_num_tokens =
|
|
||||||
CEILDIV(b_num_tokens, block_size) * block_size;
|
|
||||||
|
|
||||||
// Compute prefix sum over token counts per expert
|
|
||||||
using BlockScan = cub::BlockScan<int32_t, 1024>;
|
|
||||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
|
||||||
int cumsum_val;
|
|
||||||
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
bool const is_last_batch = batch_id == (num_batches - 1);
|
|
||||||
if (is_last_batch) {
|
|
||||||
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (batch_id < num_batches) {
|
|
||||||
int32_t const batch_offset = batch_id * max_tokens_per_batch;
|
|
||||||
for (size_t i = 0; i < b_num_tokens; ++i) {
|
|
||||||
sorted_ids[cumsum_val + i] = batch_offset + i;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t const block_start = cumsum_val / block_size;
|
|
||||||
int32_t const num_blocks = ceil_b_num_tokens / block_size;
|
|
||||||
for (size_t i = 0; i < num_blocks; ++i) {
|
|
||||||
block_ids[block_start + i] = batch_id;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace batched_moe_align_block_size
|
|
||||||
|
|
||||||
template <typename scalar_t>
|
template <typename scalar_t>
|
||||||
__global__ void moe_align_block_size_kernel(
|
__global__ void moe_align_block_size_kernel(
|
||||||
const scalar_t* __restrict__ topk_ids,
|
const scalar_t* __restrict__ topk_ids,
|
||||||
@ -345,33 +280,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
|
|
||||||
int64_t block_size,
|
|
||||||
torch::Tensor const& batch_num_tokens,
|
|
||||||
torch::Tensor sorted_ids,
|
|
||||||
torch::Tensor batch_ids,
|
|
||||||
torch::Tensor num_tokens_post_pad) {
|
|
||||||
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
|
|
||||||
|
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
|
||||||
int32_t const B = batch_num_tokens.size(0);
|
|
||||||
int32_t const num_blocks_per_batch =
|
|
||||||
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
|
|
||||||
int32_t const num_blocks = num_blocks_per_batch * B;
|
|
||||||
int64_t const sorted_ids_size = num_blocks * block_size;
|
|
||||||
|
|
||||||
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
|
|
||||||
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
|
|
||||||
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
|
|
||||||
TORCH_CHECK(B <= batched_kernel::num_threads);
|
|
||||||
|
|
||||||
batched_kernel::batched_moe_align_block_size_kernel<<<
|
|
||||||
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
|
|
||||||
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
|
|
||||||
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
|
|
||||||
num_tokens_post_pad.data_ptr<int32_t>());
|
|
||||||
}
|
|
||||||
|
|
||||||
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
|
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
|
||||||
torch::Tensor& output) // [num_tokens, hidden_size]
|
torch::Tensor& output) // [num_tokens, hidden_size]
|
||||||
{
|
{
|
||||||
|
|||||||
@ -12,14 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
|||||||
int64_t block_size, torch::Tensor sorted_token_ids,
|
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||||
torch::Tensor experts_ids,
|
torch::Tensor experts_ids,
|
||||||
torch::Tensor num_tokens_post_pad);
|
torch::Tensor num_tokens_post_pad);
|
||||||
|
|
||||||
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
|
|
||||||
int64_t block_size,
|
|
||||||
torch::Tensor const& expert_num_tokens,
|
|
||||||
torch::Tensor sorted_ids,
|
|
||||||
torch::Tensor expert_ids,
|
|
||||||
torch::Tensor num_tokens_post_pad);
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
|
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
|
||||||
torch::Tensor b_qweight, torch::Tensor b_scales,
|
torch::Tensor b_qweight, torch::Tensor b_scales,
|
||||||
|
|||||||
@ -21,6 +21,7 @@
|
|||||||
#include <c10/cuda/CUDAGuard.h>
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
#include "../cuda_compat.h"
|
#include "../cuda_compat.h"
|
||||||
#include "../cub_helpers.h"
|
#include "../cub_helpers.h"
|
||||||
|
#include "../core/batch_invariant.hpp"
|
||||||
|
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
|||||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||||
static constexpr int VPT = Constants::VPT;
|
static constexpr int VPT = Constants::VPT;
|
||||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||||
|
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||||
|
|
||||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||||
|
|||||||
@ -22,17 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
|||||||
" Tensor! num_tokens_post_pad) -> ()");
|
" Tensor! num_tokens_post_pad) -> ()");
|
||||||
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
|
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
|
||||||
|
|
||||||
// Aligning the number of tokens to be processed by each expert such
|
|
||||||
// that it is divisible by the block size, but for the batched case.
|
|
||||||
m.def(
|
|
||||||
"batched_moe_align_block_size(int max_tokens_per_batch,"
|
|
||||||
" int block_size, Tensor expert_num_tokens,"
|
|
||||||
" Tensor! sorted_token_ids,"
|
|
||||||
" Tensor! experts_ids,"
|
|
||||||
" Tensor! num_tokens_post_pad) -> ()");
|
|
||||||
m.impl("batched_moe_align_block_size", torch::kCUDA,
|
|
||||||
&batched_moe_align_block_size);
|
|
||||||
|
|
||||||
#ifndef USE_ROCM
|
#ifndef USE_ROCM
|
||||||
m.def(
|
m.def(
|
||||||
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
|
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
|
||||||
|
|||||||
@ -9,6 +9,7 @@ from collections.abc import Iterable
|
|||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from dataclasses import dataclass, fields
|
from dataclasses import dataclass, fields
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
import jinja2
|
import jinja2
|
||||||
from vllm_cutlass_library_extension import (
|
from vllm_cutlass_library_extension import (
|
||||||
@ -258,7 +259,7 @@ class ScheduleConfig:
|
|||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class TypeConfig:
|
class TypeConfig:
|
||||||
a: DataType
|
a: DataType
|
||||||
b: DataType | VLLMDataType
|
b: Union[DataType, VLLMDataType]
|
||||||
b_group_scale: DataType
|
b_group_scale: DataType
|
||||||
b_group_zeropoint: DataType
|
b_group_zeropoint: DataType
|
||||||
b_channel_scale: DataType
|
b_channel_scale: DataType
|
||||||
@ -279,7 +280,7 @@ class PrepackTypeConfig:
|
|||||||
class ImplConfig:
|
class ImplConfig:
|
||||||
types: TypeConfig
|
types: TypeConfig
|
||||||
schedules: list[ScheduleConfig]
|
schedules: list[ScheduleConfig]
|
||||||
heuristic: list[tuple[str | None, ScheduleConfig]]
|
heuristic: list[tuple[Optional[str], ScheduleConfig]]
|
||||||
|
|
||||||
|
|
||||||
def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
|
def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
|
||||||
|
|||||||
@ -22,14 +22,13 @@ template <typename AllReduceKernel, typename T>
|
|||||||
__global__ __quickreduce_launch_bounds_two_shot__ static void
|
__global__ __quickreduce_launch_bounds_two_shot__ static void
|
||||||
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
||||||
int rank, uint8_t** dbuffer_list,
|
int rank, uint8_t** dbuffer_list,
|
||||||
uint32_t data_offset, uint32_t flag_color,
|
uint32_t data_offset, uint32_t flag_color) {
|
||||||
int64_t data_size_per_phase) {
|
|
||||||
int block = blockIdx.x;
|
int block = blockIdx.x;
|
||||||
int grid = gridDim.x;
|
int grid = gridDim.x;
|
||||||
|
|
||||||
while (block < num_blocks) {
|
while (block < num_blocks) {
|
||||||
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
|
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
|
||||||
flag_color, data_size_per_phase);
|
flag_color);
|
||||||
block += grid;
|
block += grid;
|
||||||
flag_color++;
|
flag_color++;
|
||||||
}
|
}
|
||||||
@ -42,21 +41,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
|
|||||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||||
num_blocks, rank, dbuffer_list, data_offset, \
|
num_blocks, rank, dbuffer_list, data_offset, \
|
||||||
flag_color, this->kMaxProblemSize); \
|
flag_color); \
|
||||||
} else if (world_size == 4) { \
|
} else if (world_size == 4) { \
|
||||||
using LineCodec = __codec<T, 4>; \
|
using LineCodec = __codec<T, 4>; \
|
||||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||||
num_blocks, rank, dbuffer_list, data_offset, \
|
num_blocks, rank, dbuffer_list, data_offset, \
|
||||||
flag_color, this->kMaxProblemSize); \
|
flag_color); \
|
||||||
} else if (world_size == 8) { \
|
} else if (world_size == 8) { \
|
||||||
using LineCodec = __codec<T, 8>; \
|
using LineCodec = __codec<T, 8>; \
|
||||||
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
|
||||||
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
|
||||||
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
|
||||||
num_blocks, rank, dbuffer_list, data_offset, \
|
num_blocks, rank, dbuffer_list, data_offset, \
|
||||||
flag_color, this->kMaxProblemSize); \
|
flag_color); \
|
||||||
}
|
}
|
||||||
|
|
||||||
enum QuickReduceQuantLevel {
|
enum QuickReduceQuantLevel {
|
||||||
|
|||||||
@ -553,12 +553,13 @@ struct AllReduceTwoshot {
|
|||||||
int const rank, // rank index
|
int const rank, // rank index
|
||||||
uint8_t** __restrict__ buffer_list, // communication buffers
|
uint8_t** __restrict__ buffer_list, // communication buffers
|
||||||
uint32_t const data_offset, // offset to start of the data buffer
|
uint32_t const data_offset, // offset to start of the data buffer
|
||||||
uint32_t flag_color, int64_t data_size_per_phase) {
|
uint32_t flag_color) {
|
||||||
// Topology
|
// Topology
|
||||||
int thread = threadIdx.x + threadIdx.y * kWavefront;
|
int thread = threadIdx.x + threadIdx.y * kWavefront;
|
||||||
uint8_t* rank_buffer = buffer_list[rank];
|
uint8_t* rank_buffer = buffer_list[rank];
|
||||||
Codec codec(thread, rank);
|
Codec codec(thread, rank);
|
||||||
int block_id = blockIdx.x;
|
int block_id = blockIdx.x;
|
||||||
|
int grid_size = gridDim.x;
|
||||||
// --------------------------------------------------------
|
// --------------------------------------------------------
|
||||||
// Read input into registers
|
// Read input into registers
|
||||||
int32x4_t tA[kAtoms];
|
int32x4_t tA[kAtoms];
|
||||||
@ -587,10 +588,12 @@ struct AllReduceTwoshot {
|
|||||||
// rank responsible for this segment.
|
// rank responsible for this segment.
|
||||||
uint32_t comm_data0_offset =
|
uint32_t comm_data0_offset =
|
||||||
data_offset + block_id * Codec::kTransmittedTileSize;
|
data_offset + block_id * Codec::kTransmittedTileSize;
|
||||||
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset;
|
uint32_t comm_data1_offset =
|
||||||
|
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
|
||||||
|
|
||||||
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
|
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
|
||||||
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset;
|
uint32_t comm_flags1_offset =
|
||||||
|
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
|
||||||
|
|
||||||
for (int r = 0; r < kWorldSize; r++) {
|
for (int r = 0; r < kWorldSize; r++) {
|
||||||
int32x4_t* send_buffer =
|
int32x4_t* send_buffer =
|
||||||
|
|||||||
@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
|||||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||||
# sync the default value with .buildkite/check-wheel-size.py
|
# sync the default value with .buildkite/check-wheel-size.py
|
||||||
ARG VLLM_MAX_SIZE_MB=500
|
ARG VLLM_MAX_SIZE_MB=450
|
||||||
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
|
||||||
ARG RUN_WHEEL_CHECK=true
|
ARG RUN_WHEEL_CHECK=true
|
||||||
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
||||||
@ -359,8 +359,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||||
# https://docs.flashinfer.ai/installation.html
|
# https://docs.flashinfer.ai/installation.html
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system flashinfer-cubin==0.4.1 \
|
uv pip install --system flashinfer-cubin==0.4.0 \
|
||||||
&& uv pip install --system flashinfer-jit-cache==0.4.1 \
|
&& uv pip install --system flashinfer-jit-cache==0.4.0 \
|
||||||
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||||
&& flashinfer show-config
|
&& flashinfer show-config
|
||||||
|
|
||||||
|
|||||||
@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
|
|||||||
|
|
||||||
|
|
||||||
# build flashinfer for torch nightly from source around 10 mins
|
# build flashinfer for torch nightly from source around 10 mins
|
||||||
# release version: v0.4.1
|
# release version: v0.4.0
|
||||||
# todo(elainewy): cache flashinfer build result for faster build
|
# todo(elainewy): cache flashinfer build result for faster build
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
|||||||
echo "git clone flashinfer..." \
|
echo "git clone flashinfer..." \
|
||||||
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
|
||||||
&& cd flashinfer \
|
&& cd flashinfer \
|
||||||
&& git checkout v0.4.1\
|
&& git checkout v0.4.0 \
|
||||||
&& git submodule update --init --recursive \
|
&& git submodule update --init --recursive \
|
||||||
&& echo "finish git clone flashinfer..." \
|
&& echo "finish git clone flashinfer..." \
|
||||||
&& rm -rf build \
|
&& rm -rf build \
|
||||||
|
|||||||
@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
|
|||||||
RUN apt-get update -q -y && apt-get install -q -y \
|
RUN apt-get update -q -y && apt-get install -q -y \
|
||||||
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
|
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
|
||||||
apt-transport-https ca-certificates wget curl
|
apt-transport-https ca-certificates wget curl
|
||||||
# Remove sccache
|
# Remove sccache
|
||||||
RUN python3 -m pip install --upgrade pip
|
RUN python3 -m pip install --upgrade pip
|
||||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||||
ARG COMMON_WORKDIR
|
ARG COMMON_WORKDIR
|
||||||
|
|||||||
@ -11,7 +11,8 @@ The following code splits the model across 2 GPUs.
|
|||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
|
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
|
||||||
|
tensor_parallel_size=2)
|
||||||
```
|
```
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
@ -23,7 +24,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
|
|||||||
!!! note
|
!!! note
|
||||||
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
|
||||||
|
|
||||||
You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
|
||||||
|
|
||||||
## Quantization
|
## Quantization
|
||||||
|
|
||||||
@ -42,7 +43,9 @@ and the maximum batch size (`max_num_seqs` option).
|
|||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
|
llm = LLM(model="adept/fuyu-8b",
|
||||||
|
max_model_len=2048,
|
||||||
|
max_num_seqs=2)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Reduce CUDA Graphs
|
## Reduce CUDA Graphs
|
||||||
@ -58,12 +61,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
from vllm.config import CompilationConfig, CompilationMode
|
from vllm.config import CompilationConfig, CompilationLevel
|
||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
level=CompilationLevel.PIECEWISE,
|
||||||
# By default, it goes up to max_num_seqs
|
# By default, it goes up to max_num_seqs
|
||||||
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
cudagraph_capture_sizes=[1, 2, 4, 8, 16],
|
||||||
),
|
),
|
||||||
@ -75,7 +78,8 @@ You can disable graph capturing completely via the `enforce_eager` flag:
|
|||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
|
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
enforce_eager=True)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Adjust cache size
|
## Adjust cache size
|
||||||
@ -93,10 +97,8 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
|
|||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
# Accept up to 3 images and 1 video per prompt
|
# Accept up to 3 images and 1 video per prompt
|
||||||
llm = LLM(
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
limit_mm_per_prompt={"image": 3, "video": 1})
|
||||||
limit_mm_per_prompt={"image": 3, "video": 1},
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You can go a step further and disable unused modalities completely by setting its limit to zero.
|
You can go a step further and disable unused modalities completely by setting its limit to zero.
|
||||||
@ -106,10 +108,8 @@ For example, if your application only accepts image input, there is no need to a
|
|||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
# Accept any number of images but no videos
|
# Accept any number of images but no videos
|
||||||
llm = LLM(
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
limit_mm_per_prompt={"video": 0})
|
||||||
limit_mm_per_prompt={"video": 0},
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You can even run a multi-modal model for text-only inference:
|
You can even run a multi-modal model for text-only inference:
|
||||||
@ -118,10 +118,8 @@ You can even run a multi-modal model for text-only inference:
|
|||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
# Don't accept images. Just text.
|
# Don't accept images. Just text.
|
||||||
llm = LLM(
|
llm = LLM(model="google/gemma-3-27b-it",
|
||||||
model="google/gemma-3-27b-it",
|
limit_mm_per_prompt={"image": 0})
|
||||||
limit_mm_per_prompt={"image": 0},
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Configurable options
|
### Configurable options
|
||||||
@ -175,14 +173,14 @@ Here are some examples:
|
|||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
# Available for Qwen2-VL series models
|
# Available for Qwen2-VL series models
|
||||||
llm = LLM(
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
mm_processor_kwargs={
|
||||||
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28
|
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
|
||||||
)
|
})
|
||||||
|
|
||||||
# Available for InternVL series models
|
# Available for InternVL series models
|
||||||
llm = LLM(
|
llm = LLM(model="OpenGVLab/InternVL2-2B",
|
||||||
model="OpenGVLab/InternVL2-2B",
|
mm_processor_kwargs={
|
||||||
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12
|
"max_dynamic_patch": 4, # Default is 12
|
||||||
)
|
})
|
||||||
```
|
```
|
||||||
|
|||||||
@ -100,7 +100,7 @@ from vllm import LLM
|
|||||||
llm = LLM(
|
llm = LLM(
|
||||||
model="meta-llama/Llama-3.3-70B-Instruct,
|
model="meta-llama/Llama-3.3-70B-Instruct,
|
||||||
tensor_parallel_size=4,
|
tensor_parallel_size=4,
|
||||||
pipeline_parallel_size=2,
|
pipeline_parallel_size=2
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -174,14 +174,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
|
|||||||
|
|
||||||
Known supported models (with corresponding benchmarks):
|
Known supported models (with corresponding benchmarks):
|
||||||
|
|
||||||
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>)
|
- dots_ocr (<gh-pr:25466>)
|
||||||
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>)
|
- GLM-4.1V or above (<gh-pr:23168>)
|
||||||
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>)
|
- InternVL (<gh-pr:23909>)
|
||||||
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>)
|
- Kimi-VL (<gh-pr:23817>)
|
||||||
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>)
|
- Llama4 (<gh-pr:18368>)
|
||||||
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>)
|
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
||||||
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>)
|
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
|
||||||
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>)
|
- Step3 (<gh-pr:22697>)
|
||||||
|
|
||||||
## Input Processing
|
## Input Processing
|
||||||
|
|
||||||
@ -257,24 +257,18 @@ Examples:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
# Use a larger cache
|
# Use a larger cache
|
||||||
llm = LLM(
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
mm_processor_cache_gb=8)
|
||||||
mm_processor_cache_gb=8,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Use a shared-memory based IPC cache
|
# Use a shared-memory based IPC cache
|
||||||
llm = LLM(
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
tensor_parallel_size=2,
|
||||||
tensor_parallel_size=2,
|
mm_processor_cache_type="shm",
|
||||||
mm_processor_cache_type="shm",
|
mm_processor_cache_gb=8)
|
||||||
mm_processor_cache_gb=8,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Disable the cache
|
# Disable the cache
|
||||||
llm = LLM(
|
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
model="Qwen/Qwen2.5-VL-3B-Instruct",
|
mm_processor_cache_gb=0)
|
||||||
mm_processor_cache_gb=0,
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Cache Placement
|
### Cache Placement
|
||||||
|
|||||||
@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ
|
|||||||
|
|
||||||
### Tune your workloads
|
### Tune your workloads
|
||||||
|
|
||||||
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
||||||
|
|
||||||
### Future Topics We'll Cover
|
### Future Topics We'll Cover
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
|
|||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
See [LICENSE](../../LICENSE).
|
See <gh-file:LICENSE>.
|
||||||
|
|
||||||
## Developing
|
## Developing
|
||||||
|
|
||||||
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
|
|||||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||||
|
|
||||||
!!! tip
|
!!! tip
|
||||||
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
||||||
|
|
||||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||||
|
|
||||||
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
|||||||
|
|
||||||
### Documentation
|
### Documentation
|
||||||
|
|
||||||
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml).
|
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
|
||||||
|
|
||||||
Get started with:
|
Get started with:
|
||||||
|
|
||||||
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
|
|||||||
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md).
|
If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
|
||||||
|
|
||||||
## Pull Requests & Code Reviews
|
## Pull Requests & Code Reviews
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
|
|||||||
|
|
||||||
### DCO and Signed-off-by
|
### DCO and Signed-off-by
|
||||||
|
|
||||||
When contributing changes to this project, you must agree to the [DCO](../../DCO).
|
When contributing changes to this project, you must agree to the <gh-file:DCO>.
|
||||||
Commits must include a `Signed-off-by:` header which certifies agreement with
|
Commits must include a `Signed-off-by:` header which certifies agreement with
|
||||||
the terms of the DCO.
|
the terms of the DCO.
|
||||||
|
|
||||||
|
|||||||
@ -35,7 +35,6 @@ th {
|
|||||||
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
|
||||||
| Random | ✅ | ✅ | `synthetic` |
|
| Random | ✅ | ✅ | `synthetic` |
|
||||||
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
|
||||||
| RandomForReranking | ✅ | ✅ | `synthetic` |
|
|
||||||
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
| Prefix Repetition | ✅ | ✅ | `synthetic` |
|
||||||
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
|
||||||
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
|
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
|
||||||
@ -822,7 +821,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
|
|||||||
- CLIP: `--backend openai-embeddings-clip`
|
- CLIP: `--backend openai-embeddings-clip`
|
||||||
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
|
- VLM2Vec: `--backend openai-embeddings-vlm2vec`
|
||||||
|
|
||||||
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format.
|
For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
|
||||||
|
|
||||||
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
|
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
|
||||||
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
|
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
|
||||||
@ -879,51 +878,6 @@ vllm bench serve \
|
|||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
#### Reranker Benchmark
|
|
||||||
|
|
||||||
Benchmark the performance of rerank requests in vLLM.
|
|
||||||
|
|
||||||
<details class="admonition abstract" markdown="1">
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
Unlike generative models which use Completions API or Chat Completions API,
|
|
||||||
you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
|
|
||||||
|
|
||||||
For reranking, the only supported dataset is `--dataset-name random-rerank`
|
|
||||||
|
|
||||||
Start the server:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm serve BAAI/bge-reranker-v2-m3
|
|
||||||
```
|
|
||||||
|
|
||||||
Run the benchmark:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench serve \
|
|
||||||
--model BAAI/bge-reranker-v2-m3 \
|
|
||||||
--backend vllm-rerank \
|
|
||||||
--endpoint /v1/rerank \
|
|
||||||
--dataset-name random-rerank \
|
|
||||||
--tokenizer BAAI/bge-reranker-v2-m3 \
|
|
||||||
--random-input-len 512 \
|
|
||||||
--num-prompts 10 \
|
|
||||||
--random-batch-size 5
|
|
||||||
```
|
|
||||||
|
|
||||||
For reranker models, this will create `num_prompts / random_batch_size` requests with
|
|
||||||
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
|
|
||||||
In the example above, this results in 2 rerank requests with 5 "documents" each where
|
|
||||||
each document has close to 512 tokens.
|
|
||||||
|
|
||||||
Please note that the `/v1/rerank` is also supported by embedding models. So if you're running
|
|
||||||
with an embedding model, also set `--no_reranker`. Because in this case the query is
|
|
||||||
treated as an individual prompt by the server, here we send `random_batch_size - 1` documents
|
|
||||||
to account for the extra prompt which is the query. The token accounting to report the
|
|
||||||
throughput numbers correctly is also adjusted.
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
[](){ #performance-benchmarks }
|
[](){ #performance-benchmarks }
|
||||||
|
|
||||||
## Performance Benchmarks
|
## Performance Benchmarks
|
||||||
@ -962,7 +916,7 @@ For more results visualization, check the [visualizing the results](https://gith
|
|||||||
|
|
||||||
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||||
|
|
||||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||||
|
|
||||||
### Continuous Benchmarking
|
### Continuous Benchmarking
|
||||||
|
|
||||||
@ -996,4 +950,4 @@ These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lm
|
|||||||
|
|
||||||
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
|
||||||
|
|
||||||
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md).
|
||||||
|
|||||||
@ -64,7 +64,7 @@ Download the full log file from Buildkite locally.
|
|||||||
|
|
||||||
Strip timestamps and colorization:
|
Strip timestamps and colorization:
|
||||||
|
|
||||||
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
|
<gh-file:.buildkite/scripts/ci-clean-log.sh>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./ci-clean-log.sh ci.log
|
./ci-clean-log.sh ci.log
|
||||||
@ -87,7 +87,7 @@ tail -525 ci_build.log | wl-copy
|
|||||||
|
|
||||||
CI test failures may be flaky. Use a bash loop to run repeatedly:
|
CI test failures may be flaky. Use a bash loop to run repeatedly:
|
||||||
|
|
||||||
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)
|
<gh-file:.buildkite/scripts/rerun-test.sh>
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
|
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
|
||||||
|
|||||||
@ -5,7 +5,7 @@ release in CI/CD. It is standard practice to submit a PR to update the
|
|||||||
PyTorch version as early as possible when a new [PyTorch stable
|
PyTorch version as early as possible when a new [PyTorch stable
|
||||||
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
|
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
|
||||||
This process is non-trivial due to the gap between PyTorch
|
This process is non-trivial due to the gap between PyTorch
|
||||||
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
|
releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
|
||||||
update along with a list of potential issues and how to address them.
|
update along with a list of potential issues and how to address them.
|
||||||
|
|
||||||
## Test PyTorch release candidates (RCs)
|
## Test PyTorch release candidates (RCs)
|
||||||
@ -85,7 +85,7 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
|
|||||||
it doesn't populate the cache, so re-running it to warm up the cache
|
it doesn't populate the cache, so re-running it to warm up the cache
|
||||||
is ineffective.
|
is ineffective.
|
||||||
|
|
||||||
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
|
While ongoing efforts like [#17419](gh-issue:17419)
|
||||||
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
|
||||||
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
|
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
|
||||||
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
||||||
@ -138,5 +138,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
|
|||||||
for different platforms in vLLM CI/CD allows us to selectively choose
|
for different platforms in vLLM CI/CD allows us to selectively choose
|
||||||
which platforms to update. For instance, updating XPU requires the corresponding
|
which platforms to update. For instance, updating XPU requires the corresponding
|
||||||
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
|
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
|
||||||
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
|
||||||
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.
|
<gh-pr:17444> completed the update for XPU.
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# Dockerfile
|
# Dockerfile
|
||||||
|
|
||||||
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
|
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
||||||
More information about deploying with Docker can be found [here](../../deployment/docker.md).
|
More information about deploying with Docker can be found [here](../../deployment/docker.md).
|
||||||
|
|
||||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||||
|
|||||||
@ -5,7 +5,7 @@ This guide walks you through the steps to implement a basic vLLM model.
|
|||||||
## 1. Bring your model code
|
## 1. Bring your model code
|
||||||
|
|
||||||
First, clone the PyTorch model code from the source repository.
|
First, clone the PyTorch model code from the source repository.
|
||||||
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
|
For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
|
||||||
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
|
||||||
|
|
||||||
!!! warning
|
!!! warning
|
||||||
@ -73,8 +73,8 @@ def forward(
|
|||||||
self,
|
self,
|
||||||
input_ids: torch.Tensor,
|
input_ids: torch.Tensor,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
intermediate_tensors: IntermediateTensors | None = None,
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
inputs_embeds: torch.Tensor | None = None,
|
inputs_embeds: Optional[torch.Tensor] = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
@ -83,7 +83,7 @@ def forward(
|
|||||||
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
|
||||||
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
|
||||||
|
|
||||||
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.
|
For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
|
||||||
|
|
||||||
## 3. (Optional) Implement tensor parallelism and quantization support
|
## 3. (Optional) Implement tensor parallelism and quantization support
|
||||||
|
|
||||||
@ -130,22 +130,22 @@ We consider 3 different scenarios:
|
|||||||
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
||||||
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
||||||
|
|
||||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||||
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
||||||
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||||
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
||||||
V0-only classes and code will be removed in the very near future.
|
V0-only classes and code will be removed in the very near future.
|
||||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
|
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
|
||||||
|
|
||||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||||
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
||||||
|
|
||||||
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
||||||
Please follow the same guidelines as case (2) for implementing these models.
|
Please follow the same guidelines as case (2) for implementing these models.
|
||||||
We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
||||||
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
||||||
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
||||||
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
|
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
|
||||||
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
||||||
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
|
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
|
||||||
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
|
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
|
||||||
|
|||||||
@ -16,7 +16,7 @@ Further update the model as follows:
|
|||||||
...
|
...
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
|
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
|
||||||
if modality.startswith("image"):
|
if modality.startswith("image"):
|
||||||
return "<image>"
|
return "<image>"
|
||||||
|
|
||||||
@ -45,14 +45,14 @@ Further update the model as follows:
|
|||||||
...
|
...
|
||||||
|
|
||||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||||
|
|
||||||
assert self.vision_encoder is not None
|
assert self.vision_encoder is not None
|
||||||
image_features = self.vision_encoder(image_input)
|
image_features = self.vision_encoder(image_input)
|
||||||
return self.multi_modal_projector(image_features)
|
return self.multi_modal_projector(image_features)
|
||||||
|
|
||||||
def get_multimodal_embeddings(
|
def get_multimodal_embeddings(
|
||||||
self,
|
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||||
**kwargs: object,
|
|
||||||
) -> MultiModalEmbeddings | None:
|
|
||||||
# Validate the multimodal input keyword arguments
|
# Validate the multimodal input keyword arguments
|
||||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||||
if image_input is None:
|
if image_input is None:
|
||||||
@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
|
|||||||
For example, if the model supports any number of images but only one video per prompt:
|
For example, if the model supports any number of images but only one video per prompt:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||||
return {"image": None, "video": 1}
|
return {"image": None, "video": 1}
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int],
|
mm_counts: Mapping[str, int],
|
||||||
mm_options: Mapping[str, BaseDummyOptions] | None = None,
|
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
|
||||||
) -> MultiModalDataDict:
|
) -> MultiModalDataDict:
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
@ -421,10 +421,8 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
```python
|
```python
|
||||||
def get_image_size_with_most_features(self) -> ImageSize:
|
def get_image_size_with_most_features(self) -> ImageSize:
|
||||||
image_processor = self.get_image_processor()
|
image_processor = self.get_image_processor()
|
||||||
return ImageSize(
|
return ImageSize(width=image_processor.size["width"],
|
||||||
width=image_processor.size["width"],
|
height=image_processor.size["height"])
|
||||||
height=image_processor.size["height"],
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Fuyu does not expect image placeholders in the inputs to HF processor, so
|
Fuyu does not expect image placeholders in the inputs to HF processor, so
|
||||||
@ -454,12 +452,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"image":
|
"image":
|
||||||
self._get_dummy_images(
|
self._get_dummy_images(width=target_width,
|
||||||
width=target_width,
|
height=target_height,
|
||||||
height=target_height,
|
num_images=num_images,
|
||||||
num_images=num_images,
|
overrides=image_overrides)
|
||||||
overrides=image_overrides,
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -507,7 +503,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
|
Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
|
||||||
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
|
||||||
|
|
||||||
=== "With postprocessing: Fuyu"
|
=== "With postprocessing: Fuyu"
|
||||||
@ -569,7 +565,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
|
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
||||||
for text-only inputs to prevent unnecessary warnings from HF processor.
|
for text-only inputs to prevent unnecessary warnings from HF processor.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
@ -748,7 +744,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
image_width=image_size.width,
|
image_width=image_size.width,
|
||||||
image_height=image_size.height,
|
image_height=image_size.height,
|
||||||
)
|
)
|
||||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||||
|
[_NEWLINE_TOKEN_ID]) * nrows
|
||||||
|
|
||||||
return PromptUpdateDetails.select_token_id(
|
return PromptUpdateDetails.select_token_id(
|
||||||
image_tokens + [bos_token_id],
|
image_tokens + [bos_token_id],
|
||||||
@ -784,7 +781,8 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
image_width=image_size.width,
|
image_width=image_size.width,
|
||||||
image_height=image_size.height,
|
image_height=image_size.height,
|
||||||
)
|
)
|
||||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||||
|
[_NEWLINE_TOKEN_ID]) * nrows
|
||||||
|
|
||||||
return PromptUpdateDetails.select_token_id(
|
return PromptUpdateDetails.select_token_id(
|
||||||
image_tokens + [bos_token_id],
|
image_tokens + [bos_token_id],
|
||||||
@ -812,11 +810,9 @@ to register them to the multi-modal registry:
|
|||||||
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||||
+ from vllm.multimodal import MULTIMODAL_REGISTRY
|
+ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
+ @MULTIMODAL_REGISTRY.register_processor(
|
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
|
||||||
+ YourMultiModalProcessor,
|
+ info=YourProcessingInfo,
|
||||||
+ info=YourProcessingInfo,
|
+ dummy_inputs=YourDummyInputsBuilder)
|
||||||
+ dummy_inputs=YourDummyInputsBuilder,
|
|
||||||
+ )
|
|
||||||
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
|
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -828,8 +824,8 @@ Some HF processors directly insert feature tokens without replacing anything in
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
|
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
|
||||||
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
|
||||||
|
|
||||||
### Handling prompt updates unrelated to multi-modal data
|
### Handling prompt updates unrelated to multi-modal data
|
||||||
|
|
||||||
@ -837,9 +833,9 @@ Examples:
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
|
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
|
||||||
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
|
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
|
||||||
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
|
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
|
||||||
|
|
||||||
### Custom HF processor
|
### Custom HF processor
|
||||||
|
|
||||||
@ -847,6 +843,6 @@ Some models don't define an HF processor class on HF Hub. In that case, you can
|
|||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
|
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
|
||||||
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
|
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
|
||||||
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)
|
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
|
||||||
|
|||||||
@ -11,8 +11,8 @@ This page provides detailed instructions on how to do so.
|
|||||||
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
|
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
|
||||||
This gives you the ability to modify the codebase and test your model.
|
This gives you the ability to modify the codebase and test your model.
|
||||||
|
|
||||||
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
|
After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
|
||||||
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
|
Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
|
||||||
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
|
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
@ -42,7 +42,7 @@ def register():
|
|||||||
|
|
||||||
ModelRegistry.register_model(
|
ModelRegistry.register_model(
|
||||||
"YourModelForCausalLM",
|
"YourModelForCausalLM",
|
||||||
"your_code:YourModelForCausalLM",
|
"your_code:YourModelForCausalLM"
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,7 @@ Without them, the CI for your PR will fail.
|
|||||||
|
|
||||||
### Model loading
|
### Model loading
|
||||||
|
|
||||||
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
|
Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
|
||||||
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
@ -26,18 +26,18 @@ Passing these tests provides more confidence that your implementation is correct
|
|||||||
|
|
||||||
### Model correctness
|
### Model correctness
|
||||||
|
|
||||||
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
|
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of <gh-dir:tests/models>.
|
||||||
|
|
||||||
#### Generative models
|
#### Generative models
|
||||||
|
|
||||||
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
|
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
|
||||||
|
|
||||||
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
|
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
|
||||||
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
|
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
|
||||||
|
|
||||||
#### Pooling models
|
#### Pooling models
|
||||||
|
|
||||||
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
|
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
|
||||||
|
|
||||||
[](){ #mm-processing-tests }
|
[](){ #mm-processing-tests }
|
||||||
|
|
||||||
@ -45,7 +45,7 @@ For [pooling models](../../models/pooling_models.md), we simply check the cosine
|
|||||||
|
|
||||||
#### Common tests
|
#### Common tests
|
||||||
|
|
||||||
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
|
Adding your model to <gh-file:tests/models/multimodal/processing/test_common.py> verifies that the following input combinations result in the same outputs:
|
||||||
|
|
||||||
- Text + multi-modal data
|
- Text + multi-modal data
|
||||||
- Tokens + multi-modal data
|
- Tokens + multi-modal data
|
||||||
@ -54,6 +54,6 @@ Adding your model to [tests/models/multimodal/processing/test_common.py](../../.
|
|||||||
|
|
||||||
#### Model-specific tests
|
#### Model-specific tests
|
||||||
|
|
||||||
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
|
You can add a new file under <gh-dir:tests/models/multimodal/processing> to run tests that only apply to your model.
|
||||||
|
|
||||||
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).
|
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in <gh-file:tests/models/multimodal/processing/test_phi3v.py>.
|
||||||
|
|||||||
@ -15,9 +15,8 @@ Declare supported languages and capabilities:
|
|||||||
- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).
|
- Set `supports_transcription_only=True` if the model should not serve text generation (eg Whisper).
|
||||||
|
|
||||||
??? code "supported_languages and supports_transcription_only"
|
??? code "supported_languages and supports_transcription_only"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from typing import ClassVar, Mapping, Literal
|
from typing import ClassVar, Mapping, Optional, Literal
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
@ -44,7 +43,6 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
|
|||||||
This is for controlling general behavior of the API when serving your model:
|
This is for controlling general behavior of the API when serving your model:
|
||||||
|
|
||||||
??? code "get_speech_to_text_config()"
|
??? code "get_speech_to_text_config()"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class YourASRModel(nn.Module, SupportsTranscription):
|
class YourASRModel(nn.Module, SupportsTranscription):
|
||||||
...
|
...
|
||||||
@ -73,7 +71,6 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
|
|||||||
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
|
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
|
||||||
|
|
||||||
??? code "get_generation_prompt()"
|
??? code "get_generation_prompt()"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class YourASRModel(nn.Module, SupportsTranscription):
|
class YourASRModel(nn.Module, SupportsTranscription):
|
||||||
...
|
...
|
||||||
@ -84,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
|
|||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
model_config: ModelConfig,
|
model_config: ModelConfig,
|
||||||
language: str | None,
|
language: Optional[str],
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
request_prompt: str,
|
request_prompt: str,
|
||||||
to_language: str | None,
|
to_language: Optional[str],
|
||||||
) -> PromptType:
|
) -> PromptType:
|
||||||
# Example with a free-form instruction prompt
|
# Example with a free-form instruction prompt
|
||||||
task_word = "Transcribe" if task_type == "transcribe" else "Translate"
|
task_word = "Transcribe" if task_type == "transcribe" else "Translate"
|
||||||
@ -110,7 +107,6 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
|
|||||||
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
||||||
|
|
||||||
??? code "get_generation_prompt()"
|
??? code "get_generation_prompt()"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class YourASRModel(nn.Module, SupportsTranscription):
|
class YourASRModel(nn.Module, SupportsTranscription):
|
||||||
...
|
...
|
||||||
@ -121,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
|||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
model_config: ModelConfig,
|
model_config: ModelConfig,
|
||||||
language: str | None,
|
language: Optional[str],
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
request_prompt: str,
|
request_prompt: str,
|
||||||
to_language: str | None,
|
to_language: Optional[str],
|
||||||
) -> PromptType:
|
) -> PromptType:
|
||||||
if language is None:
|
if language is None:
|
||||||
raise ValueError("Language must be specified")
|
raise ValueError("Language must be specified")
|
||||||
@ -152,16 +148,12 @@ Language validation via [validate_language][vllm.model_executor.models.interface
|
|||||||
If your model requires a language and you want a default, override this method (see Whisper):
|
If your model requires a language and you want a default, override this method (see Whisper):
|
||||||
|
|
||||||
??? code "validate_language()"
|
??? code "validate_language()"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@classmethod
|
@classmethod
|
||||||
def validate_language(cls, language: str | None) -> str | None:
|
def validate_language(cls, language: Optional[str]) -> Optional[str]:
|
||||||
if language is None:
|
if language is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Defaulting to language='en'. If you wish to transcribe "
|
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
|
||||||
"audio in a different language, pass the `language` field "
|
|
||||||
"in the TranscriptionRequest."
|
|
||||||
)
|
|
||||||
language = "en"
|
language = "en"
|
||||||
return super().validate_language(language)
|
return super().validate_language(language)
|
||||||
```
|
```
|
||||||
@ -173,7 +165,6 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
|
|||||||
Provide a fast duration→token estimate to improve streaming usage statistics:
|
Provide a fast duration→token estimate to improve streaming usage statistics:
|
||||||
|
|
||||||
??? code "get_num_audio_tokens()"
|
??? code "get_num_audio_tokens()"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class YourASRModel(nn.Module, SupportsTranscription):
|
class YourASRModel(nn.Module, SupportsTranscription):
|
||||||
...
|
...
|
||||||
@ -184,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
|
|||||||
audio_duration_s: float,
|
audio_duration_s: float,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
model_config: ModelConfig,
|
model_config: ModelConfig,
|
||||||
) -> int | None:
|
) -> Optional[int]:
|
||||||
# Return None if unknown; otherwise return an estimate.
|
# Return None if unknown; otherwise return an estimate.
|
||||||
return int(audio_duration_s * stt_config.sample_rate // 320) # example
|
return int(audio_duration_s * stt_config.sample_rate // 320) # example
|
||||||
```
|
```
|
||||||
@ -200,7 +191,6 @@ The API server takes care of basic audio I/O and optional chunking before buildi
|
|||||||
Relevant server logic:
|
Relevant server logic:
|
||||||
|
|
||||||
??? code "_preprocess_speech_to_text()"
|
??? code "_preprocess_speech_to_text()"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# vllm/entrypoints/openai/speech_to_text.py
|
# vllm/entrypoints/openai/speech_to_text.py
|
||||||
async def _preprocess_speech_to_text(...):
|
async def _preprocess_speech_to_text(...):
|
||||||
@ -248,9 +238,9 @@ No extra registration is required beyond having your model class available via t
|
|||||||
|
|
||||||
## Examples in-tree
|
## Examples in-tree
|
||||||
|
|
||||||
- Whisper encoder–decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
|
- Whisper encoder–decoder (audio-only): <gh-file:vllm/model_executor/models/whisper.py>
|
||||||
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
|
- Voxtral decoder-only (audio embeddings + LLM): <gh-file:vllm/model_executor/models/voxtral.py>
|
||||||
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
|
- Gemma3n decoder-only with fixed instruction prompt: <gh-file:vllm/model_executor/models/gemma3n_mm.py>
|
||||||
|
|
||||||
## Test with the API
|
## Test with the API
|
||||||
|
|
||||||
@ -278,7 +268,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
|
|||||||
http://localhost:8000/v1/audio/translations
|
http://localhost:8000/v1/audio/translations
|
||||||
```
|
```
|
||||||
|
|
||||||
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
|
Or check out more examples in <gh-file:examples/online_serving>.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
||||||
|
|||||||
@ -33,7 +33,7 @@ Traces can be visualized using <https://ui.perfetto.dev/>.
|
|||||||
|
|
||||||
#### Offline Inference
|
#### Offline Inference
|
||||||
|
|
||||||
Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline_inference/simple_profiling.py) for an example.
|
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
|
||||||
|
|
||||||
#### OpenAI Server
|
#### OpenAI Server
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,7 @@ The image can be used to run OpenAI compatible server and is available on Docker
|
|||||||
```bash
|
```bash
|
||||||
docker run --runtime nvidia --gpus all \
|
docker run --runtime nvidia --gpus all \
|
||||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||||
--env "HF_TOKEN=$HF_TOKEN" \
|
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||||
-p 8000:8000 \
|
-p 8000:8000 \
|
||||||
--ipc=host \
|
--ipc=host \
|
||||||
vllm/vllm-openai:latest \
|
vllm/vllm-openai:latest \
|
||||||
@ -22,7 +22,7 @@ This image can also be used with other container engines such as [Podman](https:
|
|||||||
```bash
|
```bash
|
||||||
podman run --device nvidia.com/gpu=all \
|
podman run --device nvidia.com/gpu=all \
|
||||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||||
--env "HF_TOKEN=$HF_TOKEN" \
|
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||||
-p 8000:8000 \
|
-p 8000:8000 \
|
||||||
--ipc=host \
|
--ipc=host \
|
||||||
docker.io/vllm/vllm-openai:latest \
|
docker.io/vllm/vllm-openai:latest \
|
||||||
@ -37,7 +37,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
|
|||||||
memory to share data between processes under the hood, particularly for tensor parallel inference.
|
memory to share data between processes under the hood, particularly for tensor parallel inference.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
Optional dependencies are not included in order to avoid licensing issues (e.g. <https://github.com/vllm-project/vllm/issues/8030>).
|
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
|
||||||
|
|
||||||
If you need to use those dependencies (having accepted the license terms),
|
If you need to use those dependencies (having accepted the license terms),
|
||||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||||
@ -66,7 +66,7 @@ You can add any other [engine-args](../configuration/engine_args.md) you need af
|
|||||||
|
|
||||||
## Building vLLM's Docker Image from Source
|
## Building vLLM's Docker Image from Source
|
||||||
|
|
||||||
You can build and run vLLM from source via the provided [docker/Dockerfile](../../docker/Dockerfile). To build vLLM:
|
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
|
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
|
||||||
@ -128,7 +128,7 @@ To run vLLM with the custom-built Docker image:
|
|||||||
docker run --runtime nvidia --gpus all \
|
docker run --runtime nvidia --gpus all \
|
||||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||||
-p 8000:8000 \
|
-p 8000:8000 \
|
||||||
--env "HF_TOKEN=<secret>" \
|
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||||
vllm/vllm-openai <args...>
|
vllm/vllm-openai <args...>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
|
[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
|
||||||
|
|
||||||
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
|
Anyscale automates the entire lifecycle of Ray clusters in your AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
|
||||||
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like [examples/online_serving/run_cluster.sh](../../../examples/online_serving/run_cluster.sh).
|
without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, managing observability stacks, or manually managing head and worker nodes with helper scripts like <gh-file:examples/online_serving/run_cluster.sh>.
|
||||||
|
|
||||||
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
|
When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).
|
||||||
|
|
||||||
|
|||||||
@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
|
|||||||
|
|
||||||
??? console "Command"
|
??? console "Command"
|
||||||
|
|
||||||
```bash
|
```python
|
||||||
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-H 'Authorization: <JWT TOKEN>' \
|
-H 'Authorization: <JWT TOKEN>' \
|
||||||
@ -81,7 +81,7 @@ You should get a response like:
|
|||||||
|
|
||||||
??? console "Response"
|
??? console "Response"
|
||||||
|
|
||||||
```json
|
```python
|
||||||
{
|
{
|
||||||
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
|
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
|
||||||
"result": {
|
"result": {
|
||||||
|
|||||||
@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
|
|||||||
|
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
base_url="https://gateway.<gateway domain>",
|
base_url="https://gateway.<gateway domain>",
|
||||||
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
|
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
|
||||||
)
|
)
|
||||||
|
|
||||||
completion = client.chat.completions.create(
|
completion = client.chat.completions.create(
|
||||||
@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
|
|||||||
"role": "user",
|
"role": "user",
|
||||||
"content": "Compose a poem that explains the concept of recursion in programming.",
|
"content": "Compose a poem that explains the concept of recursion in programming.",
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
print(completion.choices[0].message.content)
|
print(completion.choices[0].message.content)
|
||||||
|
|||||||
@ -34,7 +34,7 @@ pip install vllm haystack-ai
|
|||||||
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
|
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
|
||||||
model="mistralai/Mistral-7B-Instruct-v0.1",
|
model="mistralai/Mistral-7B-Instruct-v0.1",
|
||||||
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
|
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
|
||||||
generation_kwargs={"max_tokens": 512},
|
generation_kwargs = {"max_tokens": 512}
|
||||||
)
|
)
|
||||||
|
|
||||||
response = generator.run(
|
response = generator.run(
|
||||||
|
|||||||
@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
base_url=DEPLOYMENT_URL,
|
base_url = DEPLOYMENT_URL,
|
||||||
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
|
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
chat_completion = client.chat.completions.create(
|
||||||
model="HuggingFaceTB/SmolLM3-3B",
|
model = "HuggingFaceTB/SmolLM3-3B",
|
||||||
messages=[
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": "Give me a brief explanation of gravity in simple terms.",
|
"text": "Give me a brief explanation of gravity in simple terms."
|
||||||
}
|
}
|
||||||
],
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
stream=True,
|
stream = True
|
||||||
)
|
)
|
||||||
|
|
||||||
for message in chat_completion:
|
for message in chat_completion:
|
||||||
print(message.choices[0].delta.content, end="")
|
print(message.choices[0].delta.content, end = "")
|
||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
base_url=DEPLOYMENT_URL,
|
base_url = DEPLOYMENT_URL,
|
||||||
api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
|
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
chat_completion = client.chat.completions.create(
|
chat_completion = client.chat.completions.create(
|
||||||
model="ibm-granite/granite-docling-258M",
|
model = "ibm-granite/granite-docling-258M",
|
||||||
messages=[
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {
|
||||||
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
|
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
|
||||||
},
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": "Convert this page to docling.",
|
"text": "Convert this page to docling."
|
||||||
},
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
stream=True,
|
stream = True
|
||||||
)
|
)
|
||||||
|
|
||||||
for message in chat_completion:
|
for message in chat_completion:
|
||||||
print(message.choices[0].delta.content, end="")
|
print(message.choices[0].delta.content, end = "")
|
||||||
```
|
```
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
|
|||||||
@ -36,16 +36,15 @@ pip install vllm litellm
|
|||||||
```python
|
```python
|
||||||
import litellm
|
import litellm
|
||||||
|
|
||||||
messages = [{"content": "Hello, how are you?", "role": "user"}]
|
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
||||||
|
|
||||||
# hosted_vllm is prefix key word and necessary
|
# hosted_vllm is prefix key word and necessary
|
||||||
response = litellm.completion(
|
response = litellm.completion(
|
||||||
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
|
||||||
messages=messages,
|
messages=messages,
|
||||||
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
|
||||||
temperature=0.2,
|
temperature=0.2,
|
||||||
max_tokens=80,
|
max_tokens=80)
|
||||||
)
|
|
||||||
|
|
||||||
print(response)
|
print(response)
|
||||||
```
|
```
|
||||||
|
|||||||
@ -35,7 +35,7 @@ Deploy the following yaml file `lws.yaml`
|
|||||||
- name: vllm-leader
|
- name: vllm-leader
|
||||||
image: docker.io/vllm/vllm-openai:latest
|
image: docker.io/vllm/vllm-openai:latest
|
||||||
env:
|
env:
|
||||||
- name: HF_TOKEN
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
value: <your-hf-token>
|
value: <your-hf-token>
|
||||||
command:
|
command:
|
||||||
- sh
|
- sh
|
||||||
@ -83,7 +83,7 @@ Deploy the following yaml file `lws.yaml`
|
|||||||
ephemeral-storage: 800Gi
|
ephemeral-storage: 800Gi
|
||||||
cpu: 125
|
cpu: 125
|
||||||
env:
|
env:
|
||||||
- name: HF_TOKEN
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
value: <your-hf-token>
|
value: <your-hf-token>
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /dev/shm
|
- mountPath: /dev/shm
|
||||||
|
|||||||
@ -36,11 +36,11 @@ pip install -U vllm \
|
|||||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
|
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_langchain.py>
|
||||||
|
|
||||||
1. Run the script
|
1. Run the script
|
||||||
|
|
||||||
```bash
|
```python
|
||||||
python retrieval_augmented_generation_with_langchain.py
|
python retrieval_augmented_generation_with_langchain.py
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -74,10 +74,10 @@ pip install vllm \
|
|||||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
|
1. Use the script: <gh-file:examples/online_serving/retrieval_augmented_generation_with_llamaindex.py>
|
||||||
|
|
||||||
1. Run the script:
|
1. Run the script:
|
||||||
|
|
||||||
```bash
|
```python
|
||||||
python retrieval_augmented_generation_with_llamaindex.py
|
python retrieval_augmented_generation_with_llamaindex.py
|
||||||
```
|
```
|
||||||
|
|||||||
@ -20,7 +20,7 @@ pip install vllm streamlit openai
|
|||||||
vllm serve Qwen/Qwen1.5-0.5B-Chat
|
vllm serve Qwen/Qwen1.5-0.5B-Chat
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
|
1. Use the script: <gh-file:examples/online_serving/streamlit_openai_chatbot_webserver.py>
|
||||||
|
|
||||||
1. Start the streamlit web UI and start to chat:
|
1. Start the streamlit web UI and start to chat:
|
||||||
|
|
||||||
|
|||||||
@ -82,7 +82,7 @@ Next, start the vLLM server as a Kubernetes Deployment and Service:
|
|||||||
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
|
||||||
]
|
]
|
||||||
env:
|
env:
|
||||||
- name: HF_TOKEN
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
name: hf-token-secret
|
name: hf-token-secret
|
||||||
@ -209,7 +209,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
"vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
||||||
]
|
]
|
||||||
env:
|
env:
|
||||||
- name: HF_TOKEN
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
name: hf-token-secret
|
name: hf-token-secret
|
||||||
@ -298,7 +298,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
|||||||
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
"vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
|
||||||
]
|
]
|
||||||
env:
|
env:
|
||||||
- name: HF_TOKEN
|
- name: HUGGING_FACE_HUB_TOKEN
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
name: hf-token-secret
|
name: hf-token-secret
|
||||||
|
|||||||
@ -49,7 +49,7 @@ Here is a sample of `LLM` class usage:
|
|||||||
|
|
||||||
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
|
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
|
||||||
|
|
||||||
The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](../../vllm/entrypoints/llm.py).
|
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
|
||||||
|
|
||||||
### OpenAI-Compatible API Server
|
### OpenAI-Compatible API Server
|
||||||
|
|
||||||
@ -60,7 +60,7 @@ This server can be started using the `vllm serve` command.
|
|||||||
vllm serve <model>
|
vllm serve <model>
|
||||||
```
|
```
|
||||||
|
|
||||||
The code for the `vllm` CLI can be found in [vllm/entrypoints/cli/main.py](../../vllm/entrypoints/cli/main.py).
|
The code for the `vllm` CLI can be found in <gh-file:vllm/entrypoints/cli/main.py>.
|
||||||
|
|
||||||
Sometimes you may see the API server entrypoint used directly instead of via the
|
Sometimes you may see the API server entrypoint used directly instead of via the
|
||||||
`vllm` CLI command. For example:
|
`vllm` CLI command. For example:
|
||||||
@ -74,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
|
|||||||
`python -m vllm.entrypoints.openai.api_server` is deprecated
|
`python -m vllm.entrypoints.openai.api_server` is deprecated
|
||||||
and may become unsupported in a future release.
|
and may become unsupported in a future release.
|
||||||
|
|
||||||
That code can be found in [vllm/entrypoints/openai/api_server.py](../../vllm/entrypoints/openai/api_server.py).
|
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
|
||||||
|
|
||||||
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
|
More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.
|
||||||
|
|
||||||
@ -101,7 +101,7 @@ processing.
|
|||||||
- **Output Processing**: Processes the outputs generated by the model, decoding the
|
- **Output Processing**: Processes the outputs generated by the model, decoding the
|
||||||
token IDs from a language model into human-readable text.
|
token IDs from a language model into human-readable text.
|
||||||
|
|
||||||
The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py](../../vllm/engine/llm_engine.py).
|
The code for `LLMEngine` can be found in <gh-file:vllm/engine/llm_engine.py>.
|
||||||
|
|
||||||
### AsyncLLMEngine
|
### AsyncLLMEngine
|
||||||
|
|
||||||
@ -111,9 +111,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
|
|||||||
can handle multiple concurrent requests and stream outputs to clients.
|
can handle multiple concurrent requests and stream outputs to clients.
|
||||||
|
|
||||||
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
|
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
|
||||||
API server that serves as a simpler example in [vllm/entrypoints/api_server.py](../../vllm/entrypoints/api_server.py).
|
API server that serves as a simpler example in <gh-file:vllm/entrypoints/api_server.py>.
|
||||||
|
|
||||||
The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py](../../vllm/engine/async_llm_engine.py).
|
The code for `AsyncLLMEngine` can be found in <gh-file:vllm/engine/async_llm_engine.py>.
|
||||||
|
|
||||||
## Worker
|
## Worker
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,7 @@ In this document we will discuss the:
|
|||||||
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len =1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
|
In this document, we refer to pure decode (`max_query_len=1`) or speculative decode (`max_query_len =1+num_spec_tokens`) as **uniform decode** batches, and the opposite would be **non-uniform** batches (i.e., prefill or mixed prefill-decode batches).
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
The following contents are mostly based on the last commit of <https://github.com/vllm-project/vllm/pull/20059>.
|
The following contents are mostly based on the last commit of <gh-pr:20059>.
|
||||||
|
|
||||||
## Motivation
|
## Motivation
|
||||||
|
|
||||||
@ -92,7 +92,7 @@ where `num_tokens` can be the padded token length, and `uniform_decode` is deter
|
|||||||
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
|
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
|
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<gh-pr:23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
|
||||||
|
|
||||||
### `CudagraphDispatcher`
|
### `CudagraphDispatcher`
|
||||||
|
|
||||||
@ -106,11 +106,9 @@ The dispatch code looks like:
|
|||||||
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
|
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
|
||||||
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
|
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
|
||||||
# execution
|
# execution
|
||||||
with set_forward_context(
|
with set_forward_context(...,
|
||||||
...,
|
cudagraph_runtime_mode=runtime_mode,
|
||||||
cudagraph_runtime_mode=runtime_mode,
|
batch_descriptor=batch_descriptor):
|
||||||
batch_descriptor=batch_descriptor,
|
|
||||||
):
|
|
||||||
output = self.model(...)
|
output = self.model(...)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -167,7 +165,7 @@ class AttentionCGSupport(enum.Enum):
|
|||||||
"""NO CUDA Graphs support"""
|
"""NO CUDA Graphs support"""
|
||||||
```
|
```
|
||||||
|
|
||||||
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
|
||||||
|
|
||||||
The following table lists backends that support full CUDA Graphs at the time of writing.
|
The following table lists backends that support full CUDA Graphs at the time of writing.
|
||||||
|
|
||||||
@ -202,12 +200,12 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
|
|||||||
import vllm
|
import vllm
|
||||||
from vllm.config import CUDAGraphMode
|
from vllm.config import CUDAGraphMode
|
||||||
|
|
||||||
compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
|
||||||
model = vllm.LLM(
|
model = vllm.LLM(
|
||||||
model="meta-llama/Llama-3.1-8B-Instruct",
|
model="meta-llama/Llama-3.1-8B-Instruct",
|
||||||
dtype="auto",
|
dtype='auto',
|
||||||
compilation_config=compilation_config,
|
compilation_config = compilation_config,
|
||||||
)
|
)
|
||||||
sampling_params = vllm.SamplingParams(
|
sampling_params = vllm.SamplingParams(
|
||||||
temperature=0, # greedy decoding
|
temperature=0, # greedy decoding
|
||||||
max_tokens=1024,
|
max_tokens=1024,
|
||||||
|
|||||||
@ -34,10 +34,10 @@ To enable the DBO system pass in the `--enable-dbo` argument to your vllm serve
|
|||||||
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
|
* `--dbo-decode-token-threshold` the minimum number of tokens in a decode-only batch required to enable DBO for that batch
|
||||||
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
|
* `--dbo-prefill-token-threshold` the minimum number of tokens in a batch containing at least one prefill required to enable DBO for that batch
|
||||||
|
|
||||||
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `--all2all-backend` argument must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
|
Currently, DBO is only supported with DeepEP, so DeepEP must be installed and the `VLLM_ALL2ALL_BACKEND` environment variable must be set to `deepep_low_latency` if your workload is primarily decode requests, or `deepep_high_throughput` if your workload is primarily prefill requests.
|
||||||
|
|
||||||
Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
|
Below is a command that will spin up a two DP rank server with expert parallelism and DBO enabled.
|
||||||
EX: `vllm serve deepseek-ai/DeepSeek-V2-Lite --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo --all2all-backend deepep_low_latency`
|
EX: `VLLM_ALL2ALL_BACKEND=deepep_low_latency vllm serve --model="deepseek-ai/DeepSeek-V2-Lite" --trust-remote-code --data-parallel-size 2 --enable-expert-parallel --enable-dbo`
|
||||||
|
|
||||||
Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`
|
Note that there must be at least two GPUs visible in `CUDA_VISIBLE_DEVICES`
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
FusedMoEModularKernel is implemented [here](../..//vllm/model_executor/layers/fused_moe/modular_kernel.py)
|
FusedMoEModularKernel is implemented [here](gh-file:/vllm/model_executor/layers/fused_moe/modular_kernel.py)
|
||||||
|
|
||||||
Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
|
Based on the format of the input activations, FusedMoE implementations are broadly classified into 2 types.
|
||||||
|
|
||||||
@ -44,7 +44,7 @@ FusedMoEModularKernel splits the FusedMoE operation into 3 parts,
|
|||||||
|
|
||||||
The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`. But some implementations choose to do it `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a TopKWeightAndReduce abstract class.
|
The TopK Weight Application and Reduction components happen right after the Unpermute operation and before the All2All Combine. Note that the `FusedMoEPermuteExpertsUnpermute` is responsible for the Unpermute and `FusedMoEPrepareAndFinalize` is responsible for the All2All Combine. There is value in doing the TopK Weight Application and Reduction in the `FusedMoEPermuteExpertsUnpermute`. But some implementations choose to do it `FusedMoEPrepareAndFinalize`. In order to enable this flexibility, we have a TopKWeightAndReduce abstract class.
|
||||||
|
|
||||||
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
Please find the implementations of TopKWeightAndReduce [here](gh-file:vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
|
||||||
|
|
||||||
`FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
|
`FusedMoEPrepareAndFinalize::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
|
||||||
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
|
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExpertsUnpermute` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
|
||||||
@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
|
|||||||
|
|
||||||
#### Step 1: Add an All2All manager
|
#### Step 1: Add an All2All manager
|
||||||
|
|
||||||
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](../../vllm/distributed/device_communicators/all2all.py).
|
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
|
||||||
|
|
||||||
#### Step 2: Add a FusedMoEPrepareAndFinalize Type
|
#### Step 2: Add a FusedMoEPrepareAndFinalize Type
|
||||||
|
|
||||||
@ -213,29 +213,29 @@ Please take a look at [init_prepare_finalize](https://github.com/vllm-project/vl
|
|||||||
|
|
||||||
### How To Unit Test
|
### How To Unit Test
|
||||||
|
|
||||||
We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py).
|
We have `FusedMoEModularKernel` unit tests at [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py).
|
||||||
|
|
||||||
The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPremuteExpertsUnpermute` types and if they are
|
The unit test iterates through all combinations of `FusedMoEPrepareAndFinalize` and `FusedMoEPremuteExpertsUnpermute` types and if they are
|
||||||
compatible, runs some correctness tests.
|
compatible, runs some correctness tests.
|
||||||
If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,
|
If you are adding some `FusedMoEPrepareAndFinalize` / `FusedMoEPermuteExpertsUnpermute` implementations,
|
||||||
|
|
||||||
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](../../tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
|
1. Add the implementation type to `MK_ALL_PREPARE_FINALIZE_TYPES` and `MK_FUSED_EXPERT_TYPES` in [mk_objects.py](gh-file:tests/kernels/moe/modular_kernel_tools/mk_objects.py) respectively.
|
||||||
2. Update `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
|
2. Update `Config::is_batched_prepare_finalize()`, `Config::is_batched_fused_experts()`, `Config::is_standard_fused_experts()`,
|
||||||
`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
|
`Config::is_fe_16bit_supported()`, `Config::is_fe_fp8_supported()`, `Config::is_fe_block_fp8_supported()`,
|
||||||
`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](../../tests/kernels/moe/modular_kernel_tools/common.py)
|
`Config::is_fe_supports_chunking()` methods in [/tests/kernels/moe/modular_kernel_tools/common.py](gh-file:tests/kernels/moe/modular_kernel_tools/common.py)
|
||||||
|
|
||||||
Doing this will add the new implementation to the test suite.
|
Doing this will add the new implementation to the test suite.
|
||||||
|
|
||||||
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
|
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
|
||||||
|
|
||||||
The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
|
The unit test file [test_modular_kernel_combinations.py](gh-file:tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
|
||||||
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||||
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
|
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
|
||||||
with incompatible types, the script will error.
|
with incompatible types, the script will error.
|
||||||
|
|
||||||
### How To Profile
|
### How To Profile
|
||||||
|
|
||||||
Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
|
Please take a look at [profile_modular_kernel.py](gh-file:tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
|
||||||
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
|
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
|
||||||
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
|
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
|
||||||
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
|
||||||
|
|||||||
@ -6,11 +6,11 @@ When performing an inference with IO Processor plugins, the prompt type is defin
|
|||||||
|
|
||||||
## Writing an IO Processor Plugin
|
## Writing an IO Processor Plugin
|
||||||
|
|
||||||
IO Processor plugins implement the [`IOProcessor`][vllm.plugins.io_processors.interface.IOProcessor] interface:
|
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
|
||||||
|
|
||||||
```python
|
```python
|
||||||
IOProcessorInput = TypeVar("IOProcessorInput")
|
IOProcessorInput = TypeVar('IOProcessorInput')
|
||||||
IOProcessorOutput = TypeVar("IOProcessorOutput")
|
IOProcessorOutput = TypeVar('IOProcessorOutput')
|
||||||
|
|
||||||
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||||
|
|
||||||
@ -21,32 +21,30 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
|||||||
def pre_process(
|
def pre_process(
|
||||||
self,
|
self,
|
||||||
prompt: IOProcessorInput,
|
prompt: IOProcessorInput,
|
||||||
request_id: str | None = None,
|
request_id: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> PromptType | Sequence[PromptType]:
|
) -> Union[PromptType, Sequence[PromptType]]:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
async def pre_process_async(
|
async def pre_process_async(
|
||||||
self,
|
self,
|
||||||
prompt: IOProcessorInput,
|
prompt: IOProcessorInput,
|
||||||
request_id: str | None = None,
|
request_id: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> PromptType | Sequence[PromptType]:
|
) -> Union[PromptType, Sequence[PromptType]]:
|
||||||
return self.pre_process(prompt, request_id, **kwargs)
|
return self.pre_process(prompt, request_id, **kwargs)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def post_process(
|
def post_process(self,
|
||||||
self,
|
model_output: Sequence[PoolingRequestOutput],
|
||||||
model_output: Sequence[PoolingRequestOutput],
|
request_id: Optional[str] = None,
|
||||||
request_id: str | None = None,
|
**kwargs) -> IOProcessorOutput:
|
||||||
**kwargs,
|
|
||||||
) -> IOProcessorOutput:
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
async def post_process_async(
|
async def post_process_async(
|
||||||
self,
|
self,
|
||||||
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
|
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
|
||||||
request_id: str | None = None,
|
request_id: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> IOProcessorOutput:
|
) -> IOProcessorOutput:
|
||||||
collected_output = [item async for i, item in model_output]
|
collected_output = [item async for i, item in model_output]
|
||||||
@ -58,8 +56,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
|||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def output_to_response(
|
def output_to_response(
|
||||||
self, plugin_output: IOProcessorOutput
|
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
|
||||||
) -> IOProcessorResponse:
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -67,9 +64,9 @@ The `parse_request` method is used for validating the user prompt and converting
|
|||||||
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
|
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
|
||||||
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
|
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
|
||||||
|
|
||||||
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py).
|
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
|
||||||
|
|
||||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples.
|
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
|
||||||
|
|
||||||
## Using an IO Processor plugin
|
## Using an IO Processor plugin
|
||||||
|
|
||||||
|
|||||||
@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
|
|||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def update_state(
|
def update_state(
|
||||||
self,
|
self,
|
||||||
batch_update: "BatchUpdate" | None,
|
batch_update: Optional["BatchUpdate"],
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Called when there are new output tokens, prior
|
"""Called when there are new output tokens, prior
|
||||||
to each forward pass.
|
to each forward pass.
|
||||||
@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
|
|||||||
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
|
* Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
|
||||||
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
|
* `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
|
||||||
|
|
||||||
* `update_state(self, batch_update: "BatchUpdate" | None) -> None`:
|
* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
|
||||||
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
|
* Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
|
||||||
* Use the `BatchUpdate` members to update logits processor internal state
|
* Use the `BatchUpdate` members to update logits processor internal state
|
||||||
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
|
* **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
|
||||||
|
|||||||
@ -80,13 +80,13 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
|
|||||||
- `vllm:request_decode_time_seconds` - Requests decode time.
|
- `vllm:request_decode_time_seconds` - Requests decode time.
|
||||||
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.
|
- `vllm:request_max_num_generation_tokens` - Max generation tokens in a sequence group.
|
||||||
|
|
||||||
See [the PR which added this Dashboard](https://github.com/vllm-project/vllm/pull/2316) for interesting and useful background on the choices made here.
|
See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful background on the choices made here.
|
||||||
|
|
||||||
### Prometheus Client Library
|
### Prometheus Client Library
|
||||||
|
|
||||||
Prometheus support was initially added [using the aioprometheus library](https://github.com/vllm-project/vllm/pull/1890), but a switch was made quickly to [prometheus_client](https://github.com/vllm-project/vllm/pull/2730). The rationale is discussed in both linked PRs.
|
Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
|
||||||
|
|
||||||
With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](https://github.com/vllm-project/vllm/pull/15657):
|
With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
|
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
|
||||||
@ -99,7 +99,7 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
|
|||||||
|
|
||||||
### Multi-process Mode
|
### Multi-process Mode
|
||||||
|
|
||||||
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <https://github.com/vllm-project/vllm/pull/7279>.
|
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.
|
||||||
|
|
||||||
### Built in Python/Process Metrics
|
### Built in Python/Process Metrics
|
||||||
|
|
||||||
@ -125,32 +125,32 @@ vLLM instance.
|
|||||||
|
|
||||||
For background, these are some of the relevant PRs which added the v0 metrics:
|
For background, these are some of the relevant PRs which added the v0 metrics:
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/pull/1890>
|
- <gh-pr:1890>
|
||||||
- <https://github.com/vllm-project/vllm/pull/2316>
|
- <gh-pr:2316>
|
||||||
- <https://github.com/vllm-project/vllm/pull/2730>
|
- <gh-pr:2730>
|
||||||
- <https://github.com/vllm-project/vllm/pull/4464>
|
- <gh-pr:4464>
|
||||||
- <https://github.com/vllm-project/vllm/pull/7279>
|
- <gh-pr:7279>
|
||||||
|
|
||||||
Also note the ["Even Better Observability"](https://github.com/vllm-project/vllm/issues/3616) feature where e.g. [a detailed roadmap was laid out](https://github.com/vllm-project/vllm/issues/3616#issuecomment-2030858781).
|
Also note the ["Even Better Observability"](gh-issue:3616) feature where e.g. [a detailed roadmap was laid out](gh-issue:3616#issuecomment-2030858781).
|
||||||
|
|
||||||
## v1 Design
|
## v1 Design
|
||||||
|
|
||||||
### v1 PRs
|
### v1 PRs
|
||||||
|
|
||||||
For background, here are the relevant v1 PRs relating to the v1
|
For background, here are the relevant v1 PRs relating to the v1
|
||||||
metrics issue <https://github.com/vllm-project/vllm/issues/10582>:
|
metrics issue <gh-issue:10582>:
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/pull/11962>
|
- <gh-pr:11962>
|
||||||
- <https://github.com/vllm-project/vllm/pull/11973>
|
- <gh-pr:11973>
|
||||||
- <https://github.com/vllm-project/vllm/pull/10907>
|
- <gh-pr:10907>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12416>
|
- <gh-pr:12416>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12478>
|
- <gh-pr:12478>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12516>
|
- <gh-pr:12516>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12530>
|
- <gh-pr:12530>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12561>
|
- <gh-pr:12561>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12579>
|
- <gh-pr:12579>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12592>
|
- <gh-pr:12592>
|
||||||
- <https://github.com/vllm-project/vllm/pull/12644>
|
- <gh-pr:12644>
|
||||||
|
|
||||||
### Metrics Collection
|
### Metrics Collection
|
||||||
|
|
||||||
@ -369,7 +369,7 @@ vllm:cache_config_info{block_size="16",cache_dtype="auto",calculate_kv_scales="F
|
|||||||
|
|
||||||
However, `prometheus_client` has
|
However, `prometheus_client` has
|
||||||
[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) -
|
[never supported Info metrics in multiprocessing mode](https://github.com/prometheus/client_python/pull/300) -
|
||||||
for [unclear reasons](https://github.com/vllm-project/vllm/pull/7279#discussion_r1710417152). We
|
for [unclear reasons](gh-pr:7279#discussion_r1710417152). We
|
||||||
simply use a `Gauge` metric set to 1 and
|
simply use a `Gauge` metric set to 1 and
|
||||||
`multiprocess_mode="mostrecent"` instead.
|
`multiprocess_mode="mostrecent"` instead.
|
||||||
|
|
||||||
@ -394,7 +394,7 @@ distinguish between per-adapter counts. This should be revisited.
|
|||||||
Note that `multiprocess_mode="livemostrecent"` is used - the most
|
Note that `multiprocess_mode="livemostrecent"` is used - the most
|
||||||
recent metric is used, but only from currently running processes.
|
recent metric is used, but only from currently running processes.
|
||||||
|
|
||||||
This was added in <https://github.com/vllm-project/vllm/pull/9477> and there is
|
This was added in <gh-pr:9477> and there is
|
||||||
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
|
[at least one known user](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/54).
|
||||||
If we revisit this design and deprecate the old metric, we should reduce
|
If we revisit this design and deprecate the old metric, we should reduce
|
||||||
the need for a significant deprecation period by making the change in
|
the need for a significant deprecation period by making the change in
|
||||||
@ -402,7 +402,7 @@ v0 also and asking this project to move to the new metric.
|
|||||||
|
|
||||||
### Prefix Cache metrics
|
### Prefix Cache metrics
|
||||||
|
|
||||||
The discussion in <https://github.com/vllm-project/vllm/issues/10582> about adding prefix cache metrics yielded
|
The discussion in <gh-issue:10582> about adding prefix cache metrics yielded
|
||||||
some interesting points which may be relevant to how we approach
|
some interesting points which may be relevant to how we approach
|
||||||
future metrics.
|
future metrics.
|
||||||
|
|
||||||
@ -439,8 +439,8 @@ suddenly (from their perspective) when it is removed, even if there is
|
|||||||
an equivalent metric for them to use.
|
an equivalent metric for them to use.
|
||||||
|
|
||||||
As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
|
As an example, see how `vllm:avg_prompt_throughput_toks_per_s` was
|
||||||
[deprecated](https://github.com/vllm-project/vllm/pull/2764) (with a comment in the code),
|
[deprecated](gh-pr:2764) (with a comment in the code),
|
||||||
[removed](https://github.com/vllm-project/vllm/pull/12383), and then [noticed by a user](https://github.com/vllm-project/vllm/issues/13218).
|
[removed](gh-pr:12383), and then [noticed by a user](gh-issue:13218).
|
||||||
|
|
||||||
In general:
|
In general:
|
||||||
|
|
||||||
@ -460,35 +460,33 @@ the project-wide deprecation policy.
|
|||||||
|
|
||||||
### Unimplemented - `vllm:tokens_total`
|
### Unimplemented - `vllm:tokens_total`
|
||||||
|
|
||||||
Added by <https://github.com/vllm-project/vllm/pull/4464>, but apparently never implemented. This can just be
|
Added by <gh-pr:4464>, but apparently never implemented. This can just be
|
||||||
removed.
|
removed.
|
||||||
|
|
||||||
### Duplicated - Queue Time
|
### Duplicated - Queue Time
|
||||||
|
|
||||||
The `vllm:time_in_queue_requests` Histogram metric was added by
|
The `vllm:time_in_queue_requests` Histogram metric was added by
|
||||||
<https://github.com/vllm-project/vllm/pull/9659> and its calculation is:
|
<gh-pr:9659> and its calculation is:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
self.metrics.first_scheduled_time = now
|
self.metrics.first_scheduled_time = now
|
||||||
self.metrics.time_in_queue = now - self.metrics.arrival_time
|
self.metrics.time_in_queue = now - self.metrics.arrival_time
|
||||||
```
|
```
|
||||||
|
|
||||||
Two weeks later, <https://github.com/vllm-project/vllm/pull/4464> added `vllm:request_queue_time_seconds` leaving
|
Two weeks later, <gh-pr:4464> added `vllm:request_queue_time_seconds` leaving
|
||||||
us with:
|
us with:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
if seq_group.is_finished():
|
if seq_group.is_finished():
|
||||||
if (
|
if (seq_group.metrics.first_scheduled_time is not None and
|
||||||
seq_group.metrics.first_scheduled_time is not None
|
seq_group.metrics.first_token_time is not None):
|
||||||
and seq_group.metrics.first_token_time is not None
|
|
||||||
):
|
|
||||||
time_queue_requests.append(
|
time_queue_requests.append(
|
||||||
seq_group.metrics.first_scheduled_time -
|
seq_group.metrics.first_scheduled_time -
|
||||||
seq_group.metrics.arrival_time
|
seq_group.metrics.arrival_time)
|
||||||
)
|
|
||||||
...
|
...
|
||||||
if seq_group.metrics.time_in_queue is not None:
|
if seq_group.metrics.time_in_queue is not None:
|
||||||
time_in_queue_requests.append(seq_group.metrics.time_in_queue)
|
time_in_queue_requests.append(
|
||||||
|
seq_group.metrics.time_in_queue)
|
||||||
```
|
```
|
||||||
|
|
||||||
This seems duplicative, and one of them should be removed. The latter
|
This seems duplicative, and one of them should be removed. The latter
|
||||||
@ -513,7 +511,7 @@ cache to complete other requests), we swap kv cache blocks out to CPU
|
|||||||
memory. This is also known as "KV cache offloading" and is configured
|
memory. This is also known as "KV cache offloading" and is configured
|
||||||
with `--swap-space` and `--preemption-mode`.
|
with `--swap-space` and `--preemption-mode`.
|
||||||
|
|
||||||
In v0, [vLLM has long supported beam search](https://github.com/vllm-project/vllm/issues/6226). The
|
In v0, [vLLM has long supported beam search](gh-issue:6226). The
|
||||||
SequenceGroup encapsulated the idea of N Sequences which
|
SequenceGroup encapsulated the idea of N Sequences which
|
||||||
all shared the same prompt kv blocks. This enabled KV cache block
|
all shared the same prompt kv blocks. This enabled KV cache block
|
||||||
sharing between requests, and copy-on-write to do branching. CPU
|
sharing between requests, and copy-on-write to do branching. CPU
|
||||||
@ -526,7 +524,7 @@ and the part of the prompt that was evicted can be recomputed.
|
|||||||
|
|
||||||
SequenceGroup was removed in V1, although a replacement will be
|
SequenceGroup was removed in V1, although a replacement will be
|
||||||
required for "parallel sampling" (`n>1`).
|
required for "parallel sampling" (`n>1`).
|
||||||
[Beam search was moved out of the core (in V0)](https://github.com/vllm-project/vllm/issues/8306). There was a
|
[Beam search was moved out of the core (in V0)](gh-issue:8306). There was a
|
||||||
lot of complex code for a very uncommon feature.
|
lot of complex code for a very uncommon feature.
|
||||||
|
|
||||||
In V1, with prefix caching being better (zero over head) and therefore
|
In V1, with prefix caching being better (zero over head) and therefore
|
||||||
@ -541,7 +539,7 @@ Some v0 metrics are only relevant in the context of "parallel
|
|||||||
sampling". This is where the `n` parameter in a request is used to
|
sampling". This is where the `n` parameter in a request is used to
|
||||||
request multiple completions from the same prompt.
|
request multiple completions from the same prompt.
|
||||||
|
|
||||||
As part of adding parallel sampling support in <https://github.com/vllm-project/vllm/pull/10980>, we should
|
As part of adding parallel sampling support in <gh-pr:10980>, we should
|
||||||
also add these metrics.
|
also add these metrics.
|
||||||
|
|
||||||
- `vllm:request_params_n` (Histogram)
|
- `vllm:request_params_n` (Histogram)
|
||||||
@ -566,7 +564,7 @@ model and then validate those tokens with the larger model.
|
|||||||
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
|
- `vllm:spec_decode_num_draft_tokens_total` (Counter)
|
||||||
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
|
- `vllm:spec_decode_num_emitted_tokens_total` (Counter)
|
||||||
|
|
||||||
There is a PR under review (<https://github.com/vllm-project/vllm/pull/12193>) to add "prompt lookup (ngram)"
|
There is a PR under review (<gh-pr:12193>) to add "prompt lookup (ngram)"
|
||||||
speculative decoding to v1. Other techniques will follow. We should
|
speculative decoding to v1. Other techniques will follow. We should
|
||||||
revisit the v0 metrics in this context.
|
revisit the v0 metrics in this context.
|
||||||
|
|
||||||
@ -587,7 +585,7 @@ see:
|
|||||||
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
|
- [Standardizing Large Model Server Metrics in Kubernetes](https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk)
|
||||||
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
|
- [Benchmarking LLM Workloads for Performance Evaluation and Autoscaling in Kubernetes](https://docs.google.com/document/d/1k4Q4X14hW4vftElIuYGDu5KDe2LtV1XammoG-Xi3bbQ)
|
||||||
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
|
- [Inference Perf](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf)
|
||||||
- <https://github.com/vllm-project/vllm/issues/5041> and <https://github.com/vllm-project/vllm/pull/12726>.
|
- <gh-issue:5041> and <gh-pr:12726>.
|
||||||
|
|
||||||
This is a non-trivial topic. Consider this comment from Rob:
|
This is a non-trivial topic. Consider this comment from Rob:
|
||||||
|
|
||||||
@ -654,7 +652,7 @@ fall under the more general heading of "Observability".
|
|||||||
|
|
||||||
v0 has support for OpenTelemetry tracing:
|
v0 has support for OpenTelemetry tracing:
|
||||||
|
|
||||||
- Added by <https://github.com/vllm-project/vllm/pull/4687>
|
- Added by <gh-pr:4687>
|
||||||
- Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces`
|
- Configured with `--oltp-traces-endpoint` and `--collect-detailed-traces`
|
||||||
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
|
- [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/)
|
||||||
- [User-facing docs](../examples/online_serving/opentelemetry.md)
|
- [User-facing docs](../examples/online_serving/opentelemetry.md)
|
||||||
@ -685,7 +683,7 @@ documentation for this option states:
|
|||||||
> use of possibly costly and or blocking operations and hence might
|
> use of possibly costly and or blocking operations and hence might
|
||||||
> have a performance impact.
|
> have a performance impact.
|
||||||
|
|
||||||
The metrics were added by <https://github.com/vllm-project/vllm/pull/7089> and who up in an OpenTelemetry trace
|
The metrics were added by <gh-pr:7089> and who up in an OpenTelemetry trace
|
||||||
as:
|
as:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
|
|||||||
@ -60,7 +60,7 @@ With the help of dummy text and automatic prompt updating, our multi-modal proce
|
|||||||
|
|
||||||
## Processor Output Caching
|
## Processor Output Caching
|
||||||
|
|
||||||
Some HF processors, such as the one for Qwen2-VL, are [very slow](https://github.com/vllm-project/vllm/issues/9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238). To alleviate this problem, we cache the multi-modal outputs of HF processor to avoid processing the same multi-modal input (e.g. image) again.
|
||||||
|
|
||||||
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
|
||||||
|
|
||||||
|
|||||||
@ -92,8 +92,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels
|
|||||||
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
|
||||||
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
|
||||||
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
|
||||||
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
| marlin | standard | <sup>3</sup> | <sup>3</sup> | silu,</br>swigluoai | Y | N | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
|
||||||
| marlin experts | standard,</br>batched | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
|
| marlin experts | standard | N/A | N/A | silu,</br>swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts] |
|
||||||
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
|
||||||
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
|
||||||
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
|
||||||
@ -115,6 +115,6 @@ The following table shows "families" of modular kernels that are intended to wor
|
|||||||
|
|
||||||
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|
||||||
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
|
||||||
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
|
||||||
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts`|
|
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`|
|
||||||
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
|
||||||
|
|||||||
@ -82,7 +82,7 @@ There are other miscellaneous places hard-coding the use of `spawn`:
|
|||||||
|
|
||||||
Related PRs:
|
Related PRs:
|
||||||
|
|
||||||
- <https://github.com/vllm-project/vllm/pull/8823>
|
- <gh-pr:8823>
|
||||||
|
|
||||||
## Prior State in v1
|
## Prior State in v1
|
||||||
|
|
||||||
|
|||||||
@ -112,8 +112,8 @@ class KVCacheBlock:
|
|||||||
ref_cnt: int
|
ref_cnt: int
|
||||||
|
|
||||||
# The pointers to form a doubly linked list for the free queue.
|
# The pointers to form a doubly linked list for the free queue.
|
||||||
prev_free_block: "KVCacheBlock | None" = None
|
prev_free_block: Optional["KVCacheBlock"] = None
|
||||||
next_free_block: "KVCacheBlock | None" = None
|
next_free_block: Optional["KVCacheBlock"] = None
|
||||||
```
|
```
|
||||||
|
|
||||||
There are two design points to highlight:
|
There are two design points to highlight:
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user