Compare commits

..

4 Commits

Author SHA1 Message Date
22bf5c5077 fix
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-11 11:38:33 -07:00
3a8990743e add truncation
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-11 11:20:31 -07:00
fbc2cc8217 merge 2025-10-11 11:09:22 -07:00
efd4bc967d [Misc] Remove in ModelRunnerOutput
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-23 21:09:20 -07:00
1402 changed files with 25152 additions and 49064 deletions

View File

@@ -5,11 +5,11 @@ import os
  import sys
  import zipfile
- # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
+ # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
  # Note that we have 800 MiB quota, please use it wisely.
  # See https://github.com/pypi/support/issues/6326 .
  # Please also sync the value with the one in Dockerfile.
- VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+ VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
  def print_top_10_largest_files(zip_file):
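This hunk only changes the size limit; the enforcement lives further down in the same script. As a rough, hypothetical sketch of the kind of check the constant feeds (illustrative names only, not the repository's actual code):

import os
import sys

# Same default as the new side of the hunk above; the quota comment explains why it is kept low.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))


def wheel_size_ok(wheel_path: str) -> bool:
    # Wheels are zip archives, so the on-disk size is what counts against the PyPI quota.
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    print(f"{wheel_path}: {size_mb:.1f} MiB (limit {VLLM_MAX_SIZE_MB} MiB)")
    return size_mb <= VLLM_MAX_SIZE_MB


if __name__ == "__main__":
    sys.exit(0 if wheel_size_ok(sys.argv[1]) else 1)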

View File

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
value: 0.80
limit: 100
num_fewshot: 0

View File

@@ -1,10 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

View File

@@ -1,5 +1,4 @@
- # For vllm script, with -t option (tensor parallel size)
- # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
+ # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
  model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
  tasks:
  - name: "gsm8k"

View File

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0

View File

@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

View File

@@ -1 +0,0 @@
Qwen2.5-VL-7B-Instruct.yaml

View File

@@ -1,44 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT

View File

View File

@@ -1,50 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto

View File

@@ -19,27 +19,21 @@ RTOL = 0.08
  def launch_lm_eval(eval_config, tp_size):
      trust_remote_code = eval_config.get("trust_remote_code", False)
      max_model_len = eval_config.get("max_model_len", 4096)
-     batch_size = eval_config.get("batch_size", "auto")
-     backend = eval_config.get("backend", "vllm")
      model_args = (
          f"pretrained={eval_config['model_name']},"
          f"tensor_parallel_size={tp_size},"
          f"enforce_eager=true,"
          f"add_bos_token=true,"
          f"trust_remote_code={trust_remote_code},"
-         f"max_model_len={max_model_len},"
+         f"max_model_len={max_model_len}"
      )
      results = lm_eval.simple_evaluate(
-         model=backend,
+         model="vllm",
          model_args=model_args,
          tasks=[task["name"] for task in eval_config["tasks"]],
          num_fewshot=eval_config["num_fewshot"],
          limit=eval_config["limit"],
-         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
-         # text models. however, this is regressing measured strict-match for
-         # existing text models in CI, so only apply it for mm.
-         apply_chat_template=backend == "vllm-vlm",
-         batch_size=batch_size,
+         batch_size="auto",
      )
      return results
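For context, the YAML files earlier in this compare (model_name / tasks / metrics / value) are the eval_config consumed here, and RTOL = 0.08 from the hunk header is the tolerance used against those baselines. A minimal sketch of that comparison; the result-dict shape is assumed from lm-eval-harness, not taken from this diff:

import math

RTOL = 0.08  # relative tolerance from the file above


def check_results(eval_config: dict, results: dict) -> None:
    # lm_eval.simple_evaluate() returns {"results": {task_name: {metric_name: value, ...}, ...}}
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            expected = metric["value"]
            assert math.isclose(measured, expected, rel_tol=RTOL), (
                f"{task['name']}/{metric['name']}: measured {measured}, expected {expected}"
            )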

View File

@@ -1,5 +1,5 @@
  steps:
- # aarch64 + CUDA builds
+ # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build arm64 wheel - CUDA 12.9"
  depends_on: ~
  id: build-wheel-arm64-cuda-12-9
@@ -8,28 +8,13 @@ steps:
  commands:
  # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
  # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
  - "mkdir artifacts"
  - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
  - "bash .buildkite/scripts/upload-wheels.sh"
  env:
  DOCKER_BUILDKIT: "1"
- # aarch64 build
- - label: "Build arm64 CPU wheel"
- depends_on: ~
- id: build-wheel-arm64-cpu
- agents:
- queue: arm64_cpu_queue_postmerge
- commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
- - "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
- env:
- DOCKER_BUILDKIT: "1"
- # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.8"
  depends_on: ~
  id: build-wheel-cuda-12-8
@@ -43,6 +28,20 @@ steps:
  env:
  DOCKER_BUILDKIT: "1"
+ - label: "Build wheel - CUDA 12.6"
+ depends_on: ~
+ id: build-wheel-cuda-12-6
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh"
+ env:
+ DOCKER_BUILDKIT: "1"
+ # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.9"
  depends_on: ~
  id: build-wheel-cuda-12-9
@@ -56,20 +55,6 @@ steps:
  env:
  DOCKER_BUILDKIT: "1"
- - label: "Build wheel - CUDA 13.0"
- depends_on: ~
- id: build-wheel-cuda-13-0
- agents:
- queue: cpu_queue_postmerge
- commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- - "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
- env:
- DOCKER_BUILDKIT: "1"
- # Build release images (12.9)
  - label: "Build release image (x86)"
  depends_on: ~
  id: build-release-image-x86
@@ -77,12 +62,13 @@ steps:
  queue: cpu_queue_postmerge
  commands:
  - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
  - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
  # re-tag to default image tag and push, just in case arm64 build fails
  - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
  depends_on: ~
  id: build-release-image-arm64
@@ -90,7 +76,7 @@ steps:
  queue: arm64_cpu_queue_postmerge
  commands:
  - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
  - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
  # Add job to create multi-arch manifest
@@ -156,22 +142,6 @@ steps:
  env:
  DOCKER_BUILDKIT: "1"
- - block: "Build arm64 CPU release image"
- key: block-arm64-cpu-release-image-build
- depends_on: ~
- - label: "Build and publish arm64 CPU release image"
- depends_on: block-arm64-cpu-release-image-build
- agents:
- queue: arm64_cpu_queue_postmerge
- commands:
- - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
- env:
- DOCKER_BUILDKIT: "1"
  - label: "Build and publish nightly multi-arch image to DockerHub"
  depends_on:
  - create-multi-arch-manifest

View File

@@ -70,7 +70,7 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
  set -e
  pytest -x -s -v \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
  # Note: disable it until supports V1
  # Run AWQ test

View File

@@ -58,25 +58,33 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
  aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
  aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
- if [[ $normal_wheel == *"cu129"* ]]; then
+ if [[ $normal_wheel == *"cu126"* ]]; then
+     # if $normal_wheel matches cu126, do not upload the index.html
+     echo "Skipping index files for cu126 wheels"
+ elif [[ $normal_wheel == *"cu128"* ]]; then
+     # if $normal_wheel matches cu128, do not upload the index.html
+     echo "Skipping index files for cu128 wheels"
+ else
      # only upload index.html for cu129 wheels (default wheels) as it
      # is available on both x86 and arm64
      aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
      aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
- else
-     echo "Skipping index files for non-cu129 wheels"
  fi
  # generate index for nightly
  aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
  aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
- if [[ $normal_wheel == *"cu129"* ]]; then
+ if [[ $normal_wheel == *"cu126"* ]]; then
+     # if $normal_wheel matches cu126, do not upload the index.html
+     echo "Skipping index files for cu126 wheels"
+ elif [[ $normal_wheel == *"cu128"* ]]; then
+     # if $normal_wheel matches cu128, do not upload the index.html
+     echo "Skipping index files for cu128 wheels"
+ else
      # only upload index.html for cu129 wheels (default wheels) as it
      # is available on both x86 and arm64
      aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
- else
-     echo "Skipping index files for non-cu129 wheels"
  fi
  aws s3 cp "$wheel" "s3://vllm-wheels/$version/"

File diff suppressed because it is too large

View File

@@ -172,8 +172,6 @@ steps:
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
- # https://github.com/NVIDIA/nccl/issues/1838
- - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
@@ -351,8 +349,7 @@ steps:
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@@ -387,12 +384,7 @@ steps:
  --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
  --ignore=lora/test_chatglm3_tp.py \
  --ignore=lora/test_llama_tp.py \
- --ignore=lora/test_llm_with_multi_loras.py \
- --ignore=lora/test_olmoe_tp.py \
- --ignore=lora/test_deepseekv2_tp.py \
- --ignore=lora/test_gptoss.py \
- --ignore=lora/test_qwen3moe_tp.py
+ --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4
  - label: PyTorch Compilation Unit Tests # 15min
@@ -424,8 +416,8 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/
- - label: PyTorch Fullgraph Test # 22min
- timeout_in_minutes: 35
+ - label: PyTorch Fullgraph Test # 20min
+ timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@@ -433,7 +425,6 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
- - pytest -v -s compile/test_fusions_e2e.py
  - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -536,9 +527,8 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
- # TODO(jerryzh168): resolve the above comment
- - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+ - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
  - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@@ -743,16 +733,6 @@ steps:
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- - label: Multi-Modal Accuracy Eval (Small Models) # 50min
- timeout_in_minutes: 70
- working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
- source_file_dependencies:
- - vllm/multimodal/
- - vllm/inputs/
- - vllm/v1/core/
- commands:
- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
  - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@@ -816,8 +796,8 @@ steps:
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- - label: Blackwell Test # 21 min
- timeout_in_minutes: 30
+ - label: Blackwell Test # 38 min
+ timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@@ -830,6 +810,8 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/fusion.py
+ - vllm/compilation/fusion_attn.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
@@ -846,32 +828,15 @@ steps:
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ # Fusion
- - label: Blackwell Fusion Tests # 30 min
- timeout_in_minutes: 40
- working_dir: "/vllm-workspace/"
- gpu: b200
- source_file_dependencies:
- - csrc/quantization/fp4/
- - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- - vllm/v1/attention/backends/flashinfer.py
- - vllm/compilation/
- # can affect pattern matching
- - vllm/model_executor/layers/layernorm.py
- - vllm/model_executor/layers/activation.py
- - vllm/model_executor/layers/quantization/input_quant_fp8.py
- commands:
- - nvidia-smi
- - pytest -v -s tests/compile/test_fusion_attn.py
- - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusions_e2e.py
+ - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
  - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
@@ -978,8 +943,6 @@ steps:
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
- # https://github.com/NVIDIA/nccl/issues/1838
- - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
@@ -987,7 +950,6 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -1031,11 +993,6 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
- # begin stat_logger plugins test
- - pip install -e ./plugins/vllm_add_dummy_stat_logger
- - pytest -v -s plugins_tests/test_stats_logger_plugins.py
- - pip uninstall dummy_stat_logger -y
- # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@@ -1075,7 +1032,6 @@ steps:
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
- - pytest -v -s -x lora/test_olmoe_tp.py
  - label: Weight Loading Multiple GPU Test # 33min
@@ -1101,17 +1057,6 @@ steps:
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- - label: NixlConnector PD accuracy tests (Distributed) # 30min
- timeout_in_minutes: 30
- working_dir: "/vllm-workspace/tests"
- num_gpus: 4
- source_file_dependencies:
- - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- - tests/v1/kv_connector/nixl_integration/
- commands:
- - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
  ##### multi gpus test #####
@@ -1144,7 +1089,7 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
  ##### H200 test #####
- - label: Distributed Tests (H200) # optional
+ - label: Distrubted Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
@@ -1152,8 +1097,6 @@ steps:
  commands:
  - pytest -v -s tests/compile/test_async_tp.py
  - pytest -v -s tests/compile/test_sequence_parallelism.py
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

View File

@@ -1,10 +1,5 @@
  [run]
- # Track the installed vllm package (this is what actually gets imported during tests)
- # Use wildcard pattern to match the installed location
- source =
-     vllm
-     */dist-packages/vllm
-     */site-packages/vllm
+ source = vllm
  omit =
      */tests/*
      */test_*
@@ -17,16 +12,6 @@ omit =
      */benchmarks/*
      */docs/*
- [paths]
- # Map all possible vllm locations to a canonical "vllm" path
- # This ensures coverage.combine properly merges data from different test runs
- source =
-     vllm
-     /vllm-workspace/src/vllm
-     /vllm-workspace/vllm
-     */site-packages/vllm
-     */dist-packages/vllm
  [report]
  exclude_lines =
      pragma: no cover
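The removed [paths] block existed so that coverage data recorded against installed copies of the package (site-packages, dist-packages, the workspace checkout) all fold into one canonical vllm tree when shards are merged. A small sketch of the merge step that relies on this remapping, using coverage.py's Python API (assumed workflow, not a command taken from this diff):

import coverage

# Loads .coveragerc, including the [paths] aliases when they are present.
cov = coverage.Coverage(config_file=".coveragerc")

# combine() picks up the per-shard .coverage.* data files and, with [paths] set,
# reports */site-packages/vllm and /vllm-workspace/vllm as the same source files.
cov.combine()
cov.save()
cov.report(show_missing=False)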

View File

@@ -1,4 +0,0 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

.github/CODEOWNERS
View File

@@ -5,8 +5,10 @@
  /vllm/attention @LucasWilkinson
  /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
  /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
- /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
- /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
+ /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+ /vllm/model_executor/layers/fused_moe @mgoin
+ /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
+ /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
  /vllm/model_executor/layers/mamba @tdoublep
  /vllm/model_executor/model_loader @22quinn
  /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
@@ -24,9 +26,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
  /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
  # vLLM V1
+ /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
  /vllm/v1/attention @LucasWilkinson
- /vllm/v1/attention/backends/mla @pavanimajety
- /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
+ /vllm/v1/attention/backends/flashinfer.py @mgoin
  /vllm/v1/attention/backends/triton_attn.py @tdoublep
  /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
  /vllm/v1/sample @22quinn @houseroad @njhill
@@ -45,7 +47,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
  /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
  /tests/models @DarkLight1337 @ywang96
  /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
- /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
+ /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
  /tests/test_inputs.py @DarkLight1337 @ywang96
  /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
  /tests/v1/structured_output @mgoin @russellb @aarnphm
@@ -58,7 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
  /tests/v1/offloading @ApostaC
  # Transformers backend
- /vllm/model_executor/models/transformers @hmellor
+ /vllm/model_executor/models/transformers.py @hmellor
  /tests/models/test_transformers.py @hmellor
  # Docs
@@ -119,11 +121,3 @@ mkdocs.yaml @hmellor
  # KVConnector installation files
  /requirements/kv_connectors.txt @NickLucche
# Pooling models
/examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop

View File

@@ -13,7 +13,6 @@ jobs:
  runs-on: ubuntu-latest
  steps:
  - name: Label issues based on keywords
- id: label-step
  uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
  with:
  script: |
@@ -43,6 +42,7 @@ jobs:
  searchIn: "body"
  },
  ],
  // Substring search - matches anywhere in text (partial matches)
  substrings: [
  {
@@ -89,12 +89,14 @@ jobs:
  term: "hip_",
  searchIn: "both"
  },
  // ROCm tools and libraries
  {
  term: "hipify",
  searchIn: "both"
  },
  ],
  // Regex patterns - for complex pattern matching
  regexPatterns: [
  {
@@ -105,17 +107,13 @@ jobs:
  }
  ],
  },
- // Add more label configurations here as needed
- // example: {
- // keywords: [...],
- // substrings: [...],
- // regexPatterns: [...]
- // },
  };
  // Helper function to create regex based on search type
  function createSearchRegex(term, type) {
  // Escape special regex characters in the term
  const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  switch (type) {
  case 'keyword':
  // Word boundary search - matches whole words only
@@ -127,13 +125,16 @@ jobs:
  throw new Error(`Unknown search type: ${type}`);
  }
  }
  // Helper function to find matching terms in text with line information
  function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
  const matches = [];
  const lines = text.split('\n');
  for (const termConfig of searchTerms) {
  let regex;
  let term, searchIn, pattern, description, flags;
  // Handle different input formats (string or object)
  if (typeof termConfig === 'string') {
  term = termConfig;
@@ -145,17 +146,21 @@ jobs:
  description = termConfig.description;
  flags = termConfig.flags;
  }
  // Skip if this term shouldn't be searched in the current location
  if (searchIn !== 'both' && searchIn !== searchLocation) {
  continue;
  }
  // Create appropriate regex
  if (searchType === 'regex') {
  regex = new RegExp(pattern, flags || "gi");
  } else {
  regex = createSearchRegex(term, searchType);
  }
  const termMatches = [];
  // Check each line for matches
  lines.forEach((line, lineIndex) => {
  const lineMatches = line.match(regex);
@@ -170,14 +175,15 @@ jobs:
  originalTerm: term || pattern,
  description: description,
  // Show context around the match in the line
  context: line.length > 100 ?
  line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
  line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
  : line.trim()
  });
  });
  }
  });
  if (termMatches.length > 0) {
  matches.push({
  term: term || (description || pattern),
@@ -190,48 +196,64 @@ jobs:
  });
  }
  }
  return matches;
  }
  // Helper function to check if label should be added
  async function processLabel(labelName, config) {
  const body = context.payload.issue.body || "";
  const title = context.payload.issue.title || "";
  core.notice(`Processing label: ${labelName}`);
  core.notice(`Issue Title: "${title}"`);
  core.notice(`Issue Body length: ${body.length} characters`);
  let shouldAddLabel = false;
  let allMatches = [];
  let reason = '';
  const keywords = config.keywords || [];
  const substrings = config.substrings || [];
  const regexPatterns = config.regexPatterns || [];
  core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
  // Search in title
  if (title.trim()) {
  core.notice(`Searching in title: "${title}"`);
  const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
  const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
  const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
  allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
  }
  // Search in body
  if (body.trim()) {
  core.notice(`Searching in body (${body.length} characters)`);
  const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
  const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
  const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
  allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
  }
  if (allMatches.length > 0) {
  core.notice(`Found ${allMatches.length} matching term(s):`);
  for (const termMatch of allMatches) {
  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
  if (termMatch.searchType === 'regex') {
  core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
  } else {
  core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
  }
  // Show details for each match
  termMatch.matches.forEach((match, index) => {
  core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -244,6 +266,7 @@ jobs:
  }
  });
  }
  shouldAddLabel = true;
  const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
  const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -251,10 +274,13 @@ jobs:
  const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
  const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
  const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
  reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
  }
  core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
  core.notice(`Reason: ${reason || 'No matching terms found'}`);
  if (shouldAddLabel) {
  const existingLabels = context.payload.issue.labels.map(l => l.name);
  if (!existingLabels.includes(labelName)) {
@@ -270,92 +296,14 @@ jobs:
  core.notice(`Label "${labelName}" already present.`);
  return false;
  }
  core.notice(`No matching terms found for label "${labelName}".`);
  return false;
  }
  // Process all configured labels
- const labelsAddedResults = await Promise.all(
- Object.entries(labelConfig).map(([labelName, config]) =>
- processLabel(labelName, config).then(added => ({ labelName, added }))
- )
- );
- const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
- core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
- // Return which labels were added for the next step
- const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
- core.setOutput('labels_added', JSON.stringify(addedLabels));
- return addedLabels;
+ const processLabels = Object.entries(labelConfig)
+ .map(([labelName, config]) => processLabel(labelName, config));
+ const labelsAdded = await Promise.all(processLabels);
+ const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+ core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}
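The createSearchRegex helper above distinguishes whole-word keyword matches from plain substring matches before falling back to raw regex patterns. A Python rendering of the same idea (illustration only; the workflow itself embeds JavaScript):

import re


def create_search_regex(term: str, search_type: str) -> re.Pattern:
    escaped = re.escape(term)  # escape special regex characters in the term
    if search_type == "keyword":
        # word-boundary search: matches whole words only
        return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
    if search_type == "substring":
        # substring search: matches anywhere in the text
        return re.compile(escaped, re.IGNORECASE)
    raise ValueError(f"Unknown search type: {search_type}")


# "rocm" as a keyword matches "ROCm 6.2" but not compounds like "rocminfo";
# "hip_" as a substring matches identifiers such as "hip_memcpy".
assert create_search_regex("rocm", "keyword").search("Build fails on ROCm 6.2")
assert create_search_regex("rocm", "keyword").search("rocminfo output") is None
assert create_search_regex("hip_", "substring").search("calls hip_memcpy directly")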

.gitignore
View File

@@ -94,9 +94,6 @@ ipython_config.py
  # generated files
  **/generated/**
- # uv
- uv.lock
  # pyenv
  # For a library or package, you might want to ignore these files since the code is
  # intended to run in multiple environments; otherwise, check them in:

View File

@@ -4,6 +4,7 @@ MD013: false
  MD024:
  siblings_only: true
  MD033: false
+ MD042: false
  MD045: false
  MD046: false
  MD051: false

View File

@@ -16,7 +16,6 @@ repos:
  rev: v1.38.1
  hooks:
  - id: typos
- args: [--force-exclude]
  - repo: https://github.com/pre-commit/mirrors-clang-format
  rev: v21.1.2
  hooks:
@@ -38,7 +37,7 @@ repos:
  rev: 0.9.1
  hooks:
  - id: pip-compile
- args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
+ args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
  files: ^requirements/test\.(in|txt)$
  - repo: local
  hooks:

View File

@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
  # requirements.txt files and should be kept consistent. The ROCm torch
  # versions are derived from docker/Dockerfile.rocm
  #
- set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
- set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
+ set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+ set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
  #
  # Try to find python package with an executable that exactly matches
@@ -883,7 +883,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
  set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
- "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")
  if(VLLM_GPU_LANG STREQUAL "CUDA")

View File

@@ -8,6 +8,7 @@ import sys
  import time
  import traceback
  from dataclasses import dataclass, field
+ from typing import Optional, Union
  import aiohttp
  import huggingface_hub.constants
@@ -27,13 +28,13 @@ class RequestFuncInput:
  prompt_len: int
  output_len: int
  model: str
- model_name: str | None = None
- logprobs: int | None = None
- extra_body: dict | None = None
- multi_modal_content: dict | list[dict] | None = None
+ model_name: Optional[str] = None
+ logprobs: Optional[int] = None
+ extra_body: Optional[dict] = None
+ multi_modal_content: Optional[dict | list[dict]] = None
  ignore_eos: bool = False
- language: str | None = None
- request_id: str | None = None
+ language: Optional[str] = None
+ request_id: Optional[str] = None
  @dataclass
@@ -51,7 +52,7 @@ class RequestFuncOutput:
  async def async_request_tgi(
  request_func_input: RequestFuncInput,
- pbar: tqdm | None = None,
+ pbar: Optional[tqdm] = None,
  ) -> RequestFuncOutput:
  api_url = request_func_input.api_url
  assert api_url.endswith("generate_stream")
@@ -132,7 +133,7 @@ async def async_request_tgi(
  async def async_request_trt_llm(
  request_func_input: RequestFuncInput,
- pbar: tqdm | None = None,
+ pbar: Optional[tqdm] = None,
  ) -> RequestFuncOutput:
  api_url = request_func_input.api_url
  assert api_url.endswith("generate_stream")
@@ -203,7 +204,7 @@ async def async_request_trt_llm(
  async def async_request_deepspeed_mii(
  request_func_input: RequestFuncInput,
- pbar: tqdm | None = None,
+ pbar: Optional[tqdm] = None,
  ) -> RequestFuncOutput:
  api_url = request_func_input.api_url
  assert api_url.endswith(("completions", "profile")), (
@@ -266,7 +267,7 @@ async def async_request_deepspeed_mii(
  async def async_request_openai_completions(
  request_func_input: RequestFuncInput,
- pbar: tqdm | None = None,
+ pbar: Optional[tqdm] = None,
  ) -> RequestFuncOutput:
  api_url = request_func_input.api_url
  assert api_url.endswith(("completions", "profile")), (
@@ -366,7 +367,7 @@ async def async_request_openai_completions(
  async def async_request_openai_chat_completions(
  request_func_input: RequestFuncInput,
- pbar: tqdm | None = None,
+ pbar: Optional[tqdm] = None,
  ) -> RequestFuncOutput:
  api_url = request_func_input.api_url
  assert api_url.endswith(("chat/completions", "profile")), (
@@ -475,7 +476,7 @@ async def async_request_openai_chat_completions(
  async def async_request_openai_audio(
  request_func_input: RequestFuncInput,
- pbar: tqdm | None = None,
+ pbar: Optional[tqdm] = None,
  ) -> RequestFuncOutput:
  # Lazy import without PlaceholderModule to avoid vllm dep.
  import soundfile
@@ -609,7 +610,7 @@ def get_tokenizer(
  tokenizer_mode: str = "auto",
  trust_remote_code: bool = False,
  **kwargs,
- ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
+ ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
if pretrained_model_name_or_path is not None and not os.path.exists( if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path pretrained_model_name_or_path
): ):

View File

@ -32,6 +32,7 @@ import dataclasses
import json import json
import random import random
import time import time
from typing import Optional
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
@ -79,7 +80,7 @@ def sample_requests_from_dataset(
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int], input_length_range: tuple[int, int],
fixed_output_len: int | None, fixed_output_len: Optional[int],
) -> list[Request]: ) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4: if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small") raise ValueError("output_len too small")
@ -127,7 +128,7 @@ def sample_requests_from_random(
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int], input_length_range: tuple[int, int],
fixed_output_len: int | None, fixed_output_len: Optional[int],
prefix_len: int, prefix_len: int,
) -> list[Request]: ) -> list[Request]:
requests = [] requests = []

View File

@ -7,6 +7,7 @@ import dataclasses
import json import json
import random import random
import time import time
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -23,7 +24,7 @@ def sample_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
fixed_output_len: int | None, fixed_output_len: Optional[int],
) -> list[tuple[str, int, int, int]]: ) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4: if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small") raise ValueError("output_len too small")

View File

@ -31,8 +31,8 @@ import time
import uuid import uuid
import warnings import warnings
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional
import datasets import datasets
import numpy as np import numpy as np
@ -316,7 +316,7 @@ def calculate_metrics(
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: list[str], selected_percentile_metrics: list[str],
selected_percentiles: list[float], selected_percentiles: list[float],
goodput_config_dict: dict[str, float] | None = None, goodput_config_dict: Optional[dict[str, float]] = None,
) -> tuple[BenchmarkMetrics, list[int]]: ) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = [] actual_output_lens: list[int] = []
total_input = 0 total_input = 0
@ -436,9 +436,9 @@ async def benchmark(
selected_percentile_metrics: list[str], selected_percentile_metrics: list[str],
selected_percentiles: list[str], selected_percentiles: list[str],
ignore_eos: bool, ignore_eos: bool,
max_concurrency: int | None, max_concurrency: Optional[int],
structured_output_ratio: float, structured_output_ratio: float,
goodput_config_dict: dict[str, float] | None = None, goodput_config_dict: Optional[dict[str, float]] = None,
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]
@ -502,9 +502,15 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests)) pbar = None if disable_tqdm else tqdm(total=len(input_requests))
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext() # This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
async def limited_request_func(request_func_input, pbar): async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore: async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar) return await request_func(request_func_input=request_func_input, pbar=pbar)
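Note on the hunk above: contextlib.nullcontext() has supported async with since Python 3.10, which is what allows the limiter to use a single async with path rather than special-casing a missing semaphore, as the two sides of this hunk show. A minimal, self-contained sketch of the pattern (the request coroutine here is a stand-in, not the benchmark's real request function):

import asyncio
from contextlib import nullcontext


async def fake_request(i: int) -> int:
    # Stand-in for the real request coroutine.
    await asyncio.sleep(0.01)
    return i


async def run_all(num_requests: int, max_concurrency: int | None) -> list[int]:
    # nullcontext() is a no-op (async) context manager, so the same
    # async with works whether or not a concurrency limit is set.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(i: int) -> int:
        async with semaphore:
            return await fake_request(i)

    return await asyncio.gather(*(limited(i) for i in range(num_requests)))


# Example: cap 100 requests at 8 in flight; pass None to run them unbounded.
# asyncio.run(run_all(100, max_concurrency=8))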

View File

@ -6,7 +6,7 @@ import math
import os import os
import time import time
from types import TracebackType from types import TracebackType
from typing import Any from typing import Any, Optional, Union
def convert_to_pytorch_benchmark_format( def convert_to_pytorch_benchmark_format(
@ -92,7 +92,7 @@ class TimeCollector:
def __init__(self, scale: int) -> None: def __init__(self, scale: int) -> None:
self.cnt: int = 0 self.cnt: int = 0
self._sum: int = 0 self._sum: int = 0
self._max: int | None = None self._max: Optional[int] = None
self.scale = scale self.scale = scale
self.start_time: int = time.monotonic_ns() self.start_time: int = time.monotonic_ns()
@ -104,13 +104,13 @@ class TimeCollector:
else: else:
self._max = max(self._max, v) self._max = max(self._max, v)
def avg(self) -> float | str: def avg(self) -> Union[float, str]:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> float | str: def max(self) -> Union[float, str]:
return self._max / self.scale if self._max else "N/A" return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[float | str]: def dump_avg_max(self) -> list[Union[float, str]]:
return [self.avg(), self.max()] return [self.avg(), self.max()]
def __enter__(self) -> None: def __enter__(self) -> None:
@ -118,8 +118,8 @@ class TimeCollector:
def __exit__( def __exit__(
self, self,
exc_type: type[BaseException] | None, exc_type: Optional[type[BaseException]],
exc_value: BaseException | None, exc_value: Optional[BaseException],
exc_traceback: TracebackType | None, exc_traceback: Optional[TracebackType],
) -> None: ) -> None:
self.collect(time.monotonic_ns() - self.start_time) self.collect(time.monotonic_ns() - self.start_time)
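For reference alongside the TimeCollector typing changes above, a self-contained sketch of the same pattern: a context manager that accumulates elapsed monotonic-nanosecond durations and reports the average and maximum, falling back to "N/A" when nothing has been collected. The NS_TO_US constant and the usage at the bottom are illustrative additions, not part of the benchmarked module.

import time


class TimeCollector:
    NS_TO_US = 1_000  # illustrative scale: nanoseconds -> microseconds

    def __init__(self, scale: int) -> None:
        self.cnt: int = 0
        self._sum: int = 0
        self._max: int | None = None
        self.scale = scale
        self.start_time: int = time.monotonic_ns()

    def collect(self, v: int) -> None:
        self.cnt += 1
        self._sum += v
        self._max = v if self._max is None else max(self._max, v)

    def avg(self) -> float | str:
        return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"

    def max(self) -> float | str:
        return self._max / self.scale if self._max else "N/A"

    def dump_avg_max(self) -> list[float | str]:
        return [self.avg(), self.max()]

    def __enter__(self) -> None:
        self.start_time = time.monotonic_ns()

    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
        self.collect(time.monotonic_ns() - self.start_time)


collector = TimeCollector(TimeCollector.NS_TO_US)
for _ in range(3):
    with collector:
        sum(range(100_000))  # the timed block
print(collector.dump_avg_max())  # [avg_us, max_us], or ["N/A", "N/A"] if never entered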

View File

@ -6,7 +6,8 @@ import copy
import itertools import itertools
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Callable
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark

View File

@ -6,7 +6,8 @@ import copy
import itertools import itertools
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -52,7 +53,7 @@ def bench_int8(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels.""" """Benchmark INT8-based kernels."""
assert dtype == torch.int8 assert dtype == torch.int8
@ -107,7 +108,7 @@ def bench_fp8(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels.""" """Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn assert dtype == torch.float8_e4m3fn
@ -182,7 +183,7 @@ def bench(
n: int, n: int,
label: str, label: str,
sub_label: str, sub_label: str,
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
if dtype == torch.int8: if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@ -200,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run( def run(
dtype: torch.dtype, dtype: torch.dtype,
MKNs: Iterable[tuple[int, int, int]], MKNs: Iterable[tuple[int, int, int]],
bench_kernels: list[str] | None = None, bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]: ) -> Iterable[TMeasurement]:
results = [] results = []
for m, k, n in MKNs: for m, k, n in MKNs:

View File

@ -3,9 +3,10 @@
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from itertools import product from itertools import product
from typing import Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -50,7 +51,7 @@ def get_bench_params() -> list[bench_params_t]:
def unfused_int8_impl( def unfused_int8_impl(
rms_norm_layer: RMSNorm, rms_norm_layer: RMSNorm,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
# Norm # Norm
@ -67,7 +68,7 @@ def unfused_int8_impl(
def unfused_fp8_impl( def unfused_fp8_impl(
rms_norm_layer: RMSNorm, rms_norm_layer: RMSNorm,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
# Norm # Norm
@ -84,7 +85,7 @@ def unfused_fp8_impl(
def fused_impl( def fused_impl(
rms_norm_layer: RMSNorm, # this stores the weights rms_norm_layer: RMSNorm, # this stores the weights
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None, residual: Optional[torch.Tensor],
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
): ):
out, _ = ops.rms_norm_dynamic_per_token_quant( out, _ = ops.rms_norm_dynamic_per_token_quant(

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from collections.abc import Callable from typing import Callable
from unittest.mock import patch from unittest.mock import patch
import pandas as pd import pandas as pd
@ -10,8 +10,7 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
def with_triton_mode(fn): def with_triton_mode(fn):

View File

@ -10,8 +10,7 @@ import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
batch_size_range = [1, 16, 32, 64, 128] batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]

View File

@ -22,8 +22,8 @@ Example:
import json import json
import os import os
import time import time
from collections.abc import Callable
from contextlib import nullcontext from contextlib import nullcontext
from typing import Callable, Optional
import torch import torch
import torch.distributed as dist import torch.distributed as dist
@ -264,12 +264,12 @@ class CommunicatorBenchmark:
def benchmark_allreduce_single( def benchmark_allreduce_single(
self, self,
sequence_length: int, sequence_length: int,
allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
should_use_fn: Callable[[torch.Tensor], bool], should_use_fn: Callable[[torch.Tensor], bool],
context, context,
num_warmup: int, num_warmup: int,
num_trials: int, num_trials: int,
) -> float | None: ) -> Optional[float]:
"""Benchmark method with CUDA graph optimization.""" """Benchmark method with CUDA graph optimization."""
try: try:
# Create test tensor (2D: sequence_length x hidden_size) # Create test tensor (2D: sequence_length x hidden_size)

View File

@ -7,8 +7,7 @@ import torch
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode() @torch.inference_mode()

View File

@ -6,12 +6,11 @@ import copy
import json import json
import pickle import pickle
import time import time
from collections.abc import Callable
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum, auto from enum import Enum, auto
from itertools import product from itertools import product
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -159,7 +158,7 @@ def ref_group_gemm(
seq_lens_cpu: torch.Tensor, seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor,
scaling: float, scaling: float,
add_inputs: bool | None, add_inputs: Optional[bool],
): ):
""" """
Torch group gemm reference implementation to test correctness of Torch group gemm reference implementation to test correctness of
@ -317,8 +316,8 @@ class BenchmarkContext:
lora_rank: int lora_rank: int
sort_by_lora_id: bool sort_by_lora_id: bool
dtype: torch.dtype dtype: torch.dtype
seq_length: int | None = None seq_length: Optional[int] = None
num_slices: int | None = None # num_slices for slice based ops num_slices: Optional[int] = None # num_slices for slice based ops
def with_seq_length(self, seq_length: int) -> "BenchmarkContext": def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
ctx = copy.copy(self) ctx = copy.copy(self)
@ -562,7 +561,7 @@ class BenchmarkTensors:
} }
def bench_fn_kwargs( def bench_fn_kwargs(
self, op_type: OpType, add_inputs: bool | None = None self, op_type: OpType, add_inputs: Optional[bool] = None
) -> dict[str, Any]: ) -> dict[str, Any]:
if op_type.is_shrink_fn(): if op_type.is_shrink_fn():
assert add_inputs is None assert add_inputs is None
@ -576,7 +575,7 @@ class BenchmarkTensors:
raise ValueError(f"Unrecognized optype {self}") raise ValueError(f"Unrecognized optype {self}")
def test_correctness( def test_correctness(
self, op_type: OpType, expand_fn_add_inputs: bool | None self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
) -> bool: ) -> bool:
""" """
Test correctness of op_type implementation against a grouped gemm Test correctness of op_type implementation against a grouped gemm
@ -612,8 +611,8 @@ def bench_optype(
ctx: BenchmarkContext, ctx: BenchmarkContext,
arg_pool_size: int, arg_pool_size: int,
op_type: OpType, op_type: OpType,
cuda_graph_nops: int | None = None, cuda_graph_nops: Optional[int] = None,
expand_fn_add_inputs: bool | None = None, expand_fn_add_inputs: Optional[bool] = None,
test_correctness: bool = False, test_correctness: bool = False,
) -> TMeasurement: ) -> TMeasurement:
assert arg_pool_size >= 1 assert arg_pool_size >= 1
@ -680,7 +679,7 @@ def bench_torch_mm(
ctx: BenchmarkContext, ctx: BenchmarkContext,
arg_pool_size: int, arg_pool_size: int,
op_type: OpType, op_type: OpType,
cuda_graph_nops: int | None = None, cuda_graph_nops: Optional[int] = None,
) -> TMeasurement: ) -> TMeasurement:
""" """
Benchmark basic torch.mm as a roofline. Benchmark basic torch.mm as a roofline.
@ -745,7 +744,7 @@ def use_cuda_graph_recommendation() -> str:
""" """
def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
compare = TBenchmark.Compare(timers) compare = TBenchmark.Compare(timers)
compare.print() compare.print()

View File

@ -8,9 +8,10 @@ import math
import os import os
import pickle as pkl import pickle as pkl
import time import time
from collections.abc import Callable, Iterable from collections.abc import Iterable
from dataclasses import dataclass from dataclasses import dataclass
from itertools import product from itertools import product
from typing import Callable, Optional
import pandas as pd import pandas as pd
import torch import torch
@ -62,23 +63,23 @@ class BenchmarkTensors:
a: torch.Tensor a: torch.Tensor
w_q: torch.Tensor w_q: torch.Tensor
group_size: int | None group_size: Optional[int]
wtype: ScalarType wtype: ScalarType
w_g_s: torch.Tensor w_g_s: torch.Tensor
w_g_zp: torch.Tensor | None w_g_zp: Optional[torch.Tensor]
w_ch_s: torch.Tensor | None w_ch_s: Optional[torch.Tensor]
w_tok_s: torch.Tensor | None w_tok_s: Optional[torch.Tensor]
@dataclass @dataclass
class TypeConfig: class TypeConfig:
act_type: torch.dtype act_type: torch.dtype
weight_type: ScalarType weight_type: ScalarType
output_type: torch.dtype | None output_type: Optional[torch.dtype]
group_scale_type: torch.dtype | None group_scale_type: Optional[torch.dtype]
group_zero_type: torch.dtype | None group_zero_type: Optional[torch.dtype]
channel_scale_type: torch.dtype | None channel_scale_type: Optional[torch.dtype]
token_scale_type: torch.dtype | None token_scale_type: Optional[torch.dtype]
def rand_data(shape, dtype=torch.float16, scale=1): def rand_data(shape, dtype=torch.float16, scale=1):
@ -92,8 +93,8 @@ def quantize_and_pack(
atype: torch.dtype, atype: torch.dtype,
w: torch.Tensor, w: torch.Tensor,
wtype: ScalarType, wtype: ScalarType,
stype: torch.dtype | None, stype: Optional[torch.dtype],
group_size: int | None, group_size: Optional[int],
zero_points: bool = False, zero_points: bool = False,
): ):
assert wtype.is_integer(), "TODO: support floating point weights" assert wtype.is_integer(), "TODO: support floating point weights"
@ -112,7 +113,7 @@ def quantize_and_pack(
def create_bench_tensors( def create_bench_tensors(
shape: tuple[int, int, int], types: TypeConfig, group_size: int | None shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
) -> list[BenchmarkTensors]: ) -> list[BenchmarkTensors]:
m, n, k = shape m, n, k = shape
@ -330,8 +331,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
return res return res
_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None _SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
def bench( def bench(

View File

@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
else: else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = torch.float16 if current_platform.is_rocm() else config.dtype dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config) block_quant_shape = get_weight_block_size_safety(config)

View File

@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
hidden_size = config.hidden_size hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.dtype dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute use_customized_permute = args.use_customized_permute

View File

@ -3,15 +3,16 @@
import random import random
import time import time
from typing import Optional
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils import (
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE, STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random, create_kv_caches_with_random,
) )
@ -36,7 +37,7 @@ def main(
seed: int, seed: int,
do_profile: bool, do_profile: bool,
device: str = "cuda", device: str = "cuda",
kv_cache_dtype: str | None = None, kv_cache_dtype: Optional[str] = None,
) -> None: ) -> None:
current_platform.seed_everything(seed) current_platform.seed_everything(seed)

View File

@ -3,8 +3,8 @@
import argparse import argparse
import math import math
from collections.abc import Callable
from contextlib import contextmanager from contextlib import contextmanager
from typing import Callable
from unittest.mock import patch from unittest.mock import patch
import torch import torch

View File

@ -0,0 +1,155 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import torch
from vllm import _custom_ops as vllm_ops
from vllm.triton_utils import triton
def polynorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
def norm(x, eps: float):
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
x = x.float()
return (
(
weight[0] * norm(x**3, eps)
+ weight[1] * norm(x**2, eps)
+ weight[2] * norm(x, eps)
+ bias
)
.to(weight.dtype)
.view(orig_shape)
)
def polynorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
out = torch.empty_like(x)
vllm_ops.poly_norm(out, x, weight, bias, eps)
output = out
output = output.view(orig_shape)
return output
def calculate_diff(batch_size, seq_len, hidden_dim):
dtype = torch.bfloat16
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
output_naive = polynorm_naive(x, weight, bias)
output_vllm = polynorm_vllm(x, weight, bias)
if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
dim_range = [2048, 4096]
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
def get_benchmark():
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["dim", "batch_size", "seq_len"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["naive", "vllm"],
line_names=["Naive", "vLLM"],
styles=[("blue", "-"), ("red", "-")],
ylabel="us",
plot_name="polynorm-perf",
args={},
)
)
def benchmark(dim, batch_size, seq_len, provider):
dtype = torch.bfloat16
hidden_dim = dim * 4
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "naive":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_naive(x, weight, bias),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_vllm(x, weight, bias),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--batch-size",
type=int,
default=4,
help="Batch size",
)
parser.add_argument(
"--seq-len",
type=int,
default=128,
help="Sequence length",
)
parser.add_argument(
"--hidden-dim",
type=int,
default=8192,
help="Intermediate size of MLP",
)
parser.add_argument(
"--save-path",
type=str,
default="./configs/polnorm/",
help="Path to save polnorm benchmark results",
)
args = parser.parse_args()
# Run correctness test
calculate_diff(
batch_size=args.batch_size,
seq_len=args.seq_len,
hidden_dim=args.hidden_dim,
)
benchmark = get_benchmark()
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)
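For readers of the new benchmark above, the quantity being measured is the polynomial norm defined by polynorm_naive: with weights w, bias b, and the reduction taken over the hidden dimension,

    out = w[0] * norm(x^3) + w[1] * norm(x^2) + w[2] * norm(x) + b,
    norm(v) = v / sqrt(mean(v^2) + eps)

where the powers are applied elementwise. polynorm_vllm (the poly_norm custom op) is checked against this reference by calculate_diff within the stated atol/rtol.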

View File

@ -7,8 +7,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode() @torch.inference_mode()

View File

@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random import random
import time import time
@ -9,9 +11,9 @@ from tabulate import tabulate
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils import (
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE, STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random, create_kv_caches_with_random,
) )

View File

@ -1,5 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random import random
import time import time
@ -12,9 +14,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
) )
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser from vllm.utils import (
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE, STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random_flash, create_kv_caches_with_random_flash,
) )

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools import itertools
from typing import Optional, Union
import torch import torch
from flashinfer.norm import fused_add_rmsnorm, rmsnorm from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@ -20,8 +21,8 @@ class HuggingFaceRMSNorm(nn.Module):
def forward( def forward(
self, self,
x: torch.Tensor, x: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
orig_dtype = x.dtype orig_dtype = x.dtype
x = x.to(torch.float32) x = x.to(torch.float32)
if residual is not None: if residual is not None:
@ -40,7 +41,7 @@ class HuggingFaceRMSNorm(nn.Module):
def rmsnorm_naive( def rmsnorm_naive(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@ -64,7 +65,7 @@ def rmsnorm_naive(
def rmsnorm_flashinfer( def rmsnorm_flashinfer(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape
@ -88,7 +89,7 @@ def rmsnorm_flashinfer(
def rmsnorm_vllm( def rmsnorm_vllm(
x: torch.Tensor, x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor | None = None, residual: Optional[torch.Tensor] = None,
eps: float = 1e-6, eps: float = 1e-6,
): ):
orig_shape = x.shape orig_shape = x.shape

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate from itertools import accumulate
from typing import Optional
import nvtx import nvtx
import torch import torch
@ -17,7 +18,7 @@ def benchmark_rope_kernels_multi_lora(
seq_len: int, seq_len: int,
num_heads: int, num_heads: int,
head_size: int, head_size: int,
rotary_dim: int | None, rotary_dim: Optional[int],
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: str, device: str,

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_decode( def benchmark_decode(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -4,6 +4,7 @@
import csv import csv
import os import os
from datetime import datetime from datetime import datetime
from typing import Optional
import flashinfer import flashinfer
import torch import torch
@ -27,7 +28,9 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad() @torch.no_grad()
def benchmark_prefill( def benchmark_prefill(
dtype: torch.dtype, dtype: torch.dtype,
quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], quant_dtypes: tuple[
Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
],
batch_size: int, batch_size: int,
max_seq_len: int, max_seq_len: int,
num_heads: tuple[int, int] = (64, 8), num_heads: tuple[int, int] = (64, 8),

View File

@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from collections.abc import Callable, Iterable from collections.abc import Iterable
from typing import Any from typing import Any, Callable, Optional
import torch import torch
import torch.utils.benchmark as TBenchmark import torch.utils.benchmark as TBenchmark
@ -55,7 +55,7 @@ class Bench:
def __init__( def __init__(
self, self,
cuda_graph_params: CudaGraphBenchParams | None, cuda_graph_params: Optional[CudaGraphBenchParams],
label: str, label: str,
sub_label: str, sub_label: str,
description: str, description: str,

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from statistics import mean from statistics import mean
from typing import Any, NamedTuple from typing import Any, NamedTuple, Optional, Union
import numpy as np # type: ignore import numpy as np # type: ignore
import pandas as pd # type: ignore import pandas as pd # type: ignore
@ -35,8 +35,8 @@ class Distribution(ABC):
class UniformDistribution(Distribution): class UniformDistribution(Distribution):
def __init__( def __init__(
self, self,
min_val: int | float, min_val: Union[int, float],
max_val: int | float, max_val: Union[int, float],
is_integer: bool = True, is_integer: bool = True,
) -> None: ) -> None:
self.min_val = min_val self.min_val = min_val
@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
class ConstantDistribution(Distribution): class ConstantDistribution(Distribution):
def __init__(self, value: int | float) -> None: def __init__(self, value: Union[int, float]) -> None:
self.value = value self.value = value
self.max_val = value self.max_val = value
@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
class ZipfDistribution(Distribution): class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
class PoissonDistribution(Distribution): class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None: def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha self.alpha = alpha
self.max_val = max_val self.max_val = max_val
@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
class LognormalDistribution(Distribution): class LognormalDistribution(Distribution):
def __init__( def __init__(
self, self,
mean: float | None = None, mean: Optional[float] = None,
sigma: float | None = None, sigma: Optional[float] = None,
average: int | None = None, average: Optional[int] = None,
median_ratio: float | None = None, median_ratio: Optional[float] = None,
max_val: int | None = None, max_val: Optional[int] = None,
) -> None: ) -> None:
self.average = average self.average = average
self.median_ratio = median_ratio self.median_ratio = median_ratio

View File

@ -13,7 +13,7 @@ from datetime import datetime
from enum import Enum from enum import Enum
from http import HTTPStatus from http import HTTPStatus
from statistics import mean from statistics import mean
from typing import NamedTuple from typing import NamedTuple, Union
import aiohttp # type: ignore import aiohttp # type: ignore
import numpy as np # type: ignore import numpy as np # type: ignore
@ -169,7 +169,7 @@ class MovingAverage:
class DebugStats: class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None: def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger self.logger = logger
self.metrics: dict[str, MovingAverage | MetricStats] = { self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
"moving_avg_ttft_ms": MovingAverage(window_size), "moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size), "moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(), "ttft_ms": MetricStats(),
@ -636,7 +636,7 @@ async def client_main(
if args.verbose: if args.verbose:
curr_time_sec: float = time.perf_counter() curr_time_sec: float = time.perf_counter()
time_since_last_turn: str | float = "N/A" time_since_last_turn: Union[str, float] = "N/A"
if conv_id in time_of_last_turn: if conv_id in time_of_last_turn:
time_since_last_turn = round( time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3 curr_time_sec - time_of_last_turn[conv_id], 3
@ -928,13 +928,13 @@ async def main_mp(
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
) )
rps: str | float = round(len(client_metrics) / runtime_sec, 3) rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients): if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low # Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed) # (threshold can be tuned if needed)
rps = "N/A" rps = "N/A"
runtime_left_sec: str | float = round( runtime_left_sec: Union[str, float] = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3 (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
) )
if percent < 0.05: if percent < 0.05:
@ -1251,7 +1251,7 @@ async def main() -> None:
default=None, default=None,
help="The model name used in the API. " help="The model name used in the API. "
"If not specified, the model name will be the " "If not specified, the model name will be the "
"same as the `--model` argument. ", "same as the ``--model`` argument. ",
) )
parser.add_argument( parser.add_argument(

View File

@ -13,7 +13,7 @@ import argparse
import json import json
import random import random
from statistics import mean from statistics import mean
from typing import Any from typing import Any, Optional
import pandas as pd # type: ignore import pandas as pd # type: ignore
import tqdm # type: ignore import tqdm # type: ignore
@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
def content_is_valid( def content_is_valid(
content: str, min_content_len: int | None, max_content_len: int | None content: str, min_content_len: Optional[int], max_content_len: Optional[int]
) -> bool: ) -> bool:
if min_content_len and len(content) < min_content_len: if min_content_len and len(content) < min_content_len:
return False return False
@ -37,7 +37,7 @@ def content_is_valid(
def print_stats( def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
) -> None: ) -> None:
# Collect statistics # Collect statistics
stats = [] stats = []
@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
seed: int, seed: int,
input_file: str, input_file: str,
output_file: str, output_file: str,
max_items: int | None, max_items: Optional[int],
min_content_len: int | None = None, min_content_len: Optional[int] = None,
max_content_len: int | None = None, max_content_len: Optional[int] = None,
min_turns: int | None = None, min_turns: Optional[int] = None,
max_turns: int | None = None, max_turns: Optional[int] = None,
model: str | None = None, model: Optional[str] = None,
) -> None: ) -> None:
if min_turns and max_turns: if min_turns and max_turns:
assert min_turns <= max_turns assert min_turns <= max_turns

View File

@ -188,47 +188,16 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.") message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif() endif()
#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
# Flag to enable ACL kernels for AARCH64 platforms
if (VLLM_BUILD_ACL STREQUAL "ON")
set(USE_ACL ON)
else()
set(USE_ACL OFF)
endif()
# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
if(ASIMD_FOUND)
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
FetchContent_Populate(arm_compute
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
GIT_TAG v52.2.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
endif()
# Build ACL with scons
include(ProcessorCount)
ProcessorCount(_NPROC)
execute_process(
COMMAND scons -j${_NPROC}
Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
multi_isa=1 openmp=1 cppthreads=0
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
if(NOT _acl_rc EQUAL 0)
message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.") set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN) if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@ -248,6 +217,16 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
) )
endif() endif()
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
if(NOT ARM_COMPUTE_LIBRARY)
message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC") set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF") set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF") set(ONEDNN_BUILD_EXAMPLES "OFF")

View File

@ -19,7 +19,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
flashmla flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 28417e516fcbf6257a422ba117ef5b6f44da5682 GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
@ -66,7 +66,6 @@ if(FLASH_MLA_ARCHS)
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
) )
set(FlashMLA_INCLUDES set(FlashMLA_INCLUDES

View File

@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_COMMAND "" BUILD_COMMAND ""
) )
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif() endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR) if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif() endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
vllm-flash-attn vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54 GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types # Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -1,12 +0,0 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -125,37 +125,32 @@ public:
} }
static void set_split_kv (KernelArguments& args) { static void set_split_kv (KernelArguments& args) {
// printf("set_split_kv start");
if (args.split_kv >= 1) return; if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape; auto [H, K, D, B] = args.problem_shape;
// std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count; int sm_count = args.hw_info.sm_count;
float seq_length_k = static_cast<float>(K) / 1024.0f; // printf(" sm_count = %d\n", sm_count);
int max_splits = 1; int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
if (B <= 4 && seq_length_k >= 16) { // TODO: This avoids a hang when the batch size larger than 1 and
max_splits = 16; // there is more than 1 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
} }
else if (B <= 8 && seq_length_k >= 4) {
max_splits = 8; // printf(" max_splits = %d\n", max_splits);
}
else if ((B <= 16 && seq_length_k >= 8) ||
(B == 48 && seq_length_k >= 32)) {
max_splits = 4;
}
else if ((B <= 32 && seq_length_k >= 16) ||
(B == 96 && seq_length_k >= 16)) {
max_splits = 2;
}
else {
max_splits = 1;
}
// Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B); int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch); int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count); int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur); int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves); int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware; args.split_kv = split_wave_aware;
// printf(" args.split_kv = %d\n", args.split_kv);
} }
/// Determines whether the GEMM can execute the given problem. /// Determines whether the GEMM can execute the given problem.
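As a reading aid for the set_split_kv hunk above: the heuristic only runs when split_kv has not already been set (the early return at the top), and the batch- and sequence-length-based variant picks max_splits from a small table, then rounds it so the K dimension runs an integer number of waves over the available SMs. Below is a Python mirror of that arithmetic, a sketch of the logic as it appears in the hunk; the 132-SM figure in the example is an assumed Hopper-class value, not something stated in the diff.

def ceil_div(a: int, b: int) -> int:
    return -(-a // b)


def split_kv_heuristic(B: int, K: int, sm_count: int) -> int:
    # Pick max_splits from batch size and KV length (in units of 1024 tokens).
    seq_length_k = K / 1024.0
    if B <= 4 and seq_length_k >= 16:
        max_splits = 16
    elif B <= 8 and seq_length_k >= 4:
        max_splits = 8
    elif (B <= 16 and seq_length_k >= 8) or (B == 48 and seq_length_k >= 32):
        max_splits = 4
    elif (B <= 32 and seq_length_k >= 16) or (B == 96 and seq_length_k >= 16):
        max_splits = 2
    else:
        max_splits = 1

    # Wave-aware rounding: keep an integer number of K-dimension waves.
    sms_per_batch = max(1, sm_count // B)
    split_heur = min(max_splits, sms_per_batch)
    k_waves = ceil_div(max_splits, split_heur)
    return ceil_div(max_splits, k_waves)


# Example: B=4, K=32768 (32k KV tokens), 132 SMs -> 16 KV splits.
# print(split_kv_heuristic(4, 32 * 1024, 132))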

View File

@ -5,11 +5,11 @@
namespace vllm { namespace vllm {
// vllm_is_batch_invariant(); returns true // vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1 // if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() { inline bool vllm_kernel_override_batch_invariant() {
static bool cached = []() { static bool cached = []() {
std::string env_key = "VLLM_BATCH_INVARIANT"; std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str()); const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0; return (val && std::atoi(val) != 0) ? 1 : 0;
}(); }();

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum import enum
from typing import Union
from cutlass_library import * from cutlass_library import *
@ -21,7 +22,7 @@ class MixedInputKernelScheduleType(enum.Enum):
TmaWarpSpecializedCooperative = enum_auto() TmaWarpSpecializedCooperative = enum_auto()
VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeNames, # type: ignore **DataTypeNames, # type: ignore
**{ **{
VLLMDataType.u4b8: "u4b8", VLLMDataType.u4b8: "u4b8",
@ -29,7 +30,7 @@ VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
**DataTypeTag, # type: ignore **DataTypeTag, # type: ignore
**{ **{
VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@ -37,7 +38,7 @@ VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
}, },
} }
VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
**DataTypeSize, # type: ignore **DataTypeSize, # type: ignore
**{ **{
VLLMDataType.u4b8: 4, VLLMDataType.u4b8: 4,
@ -45,7 +46,7 @@ VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
}, },
} }
VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u4b8: "vllm::kU4B8",
VLLMDataType.u8b128: "vllm::kU8B128", VLLMDataType.u8b128: "vllm::kU8B128",
DataType.u4: "vllm::kU4", DataType.u4: "vllm::kU4",
@ -56,7 +57,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.bf16: "vllm::kBfloat16", DataType.bf16: "vllm::kBfloat16",
} }
VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
DataType.u8: "at::ScalarType::Byte", DataType.u8: "at::ScalarType::Byte",
DataType.s8: "at::ScalarType::Char", DataType.s8: "at::ScalarType::Char",
DataType.e4m3: "at::ScalarType::Float8_e4m3fn", DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@ -66,7 +67,9 @@ VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
DataType.f32: "at::ScalarType::Float", DataType.f32: "at::ScalarType::Float",
} }
VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { VLLMKernelScheduleTag: dict[
Union[MixedInputKernelScheduleType, KernelScheduleType], str
] = {
**KernelScheduleTag, # type: ignore **KernelScheduleTag, # type: ignore
**{ **{
MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501

View File

@ -2,7 +2,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -19,22 +18,11 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) { const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
constexpr int VEC_SIZE = 8; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) { const float x = (float)input[blockIdx.x * input_stride + idx];
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;
@ -148,6 +136,211 @@ fused_add_rms_norm_kernel(
} }
} }
/* Function specialization in the case of FP16/BF16 tensors.
Additional optimizations we can make in this case are
packed and vectorized operations, which help with the
memory latency bottleneck.
_f16VecPN struct extends _f16Vec to add operations specifically required for
polynomial normalization (poly norm).
The original _f16Vec does not include the sum-of-powers computation or
in-place polynomial normalization logic. */
template <typename scalar_t, int width>
struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
using Base = _f16Vec<scalar_t, width>;
using Converter = typename Base::Converter;
using T1 = typename Base::T1;
using T2 = typename Base::T2;
using Base::data;
__device__ auto sum_pows() const {
float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x4 = x2 * x2;
float x6 = x4 * x2;
float y2 = z.y * z.y;
float y4 = y2 * y2;
float y6 = y4 * y2;
s2 += x2 + y2;
s4 += x4 + y4;
s6 += x6 + y6;
}
return std::make_tuple(s2, s4, s6);
}
__device__ void poly_norm_inplace(const float w2_inv_std,
const float w1_inv_std2,
const float w0_inv_std3, const float bias) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x3 = x2 * z.x;
z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
float y2 = z.y * z.y;
float y3 = y2 * z.y;
z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
auto out = Converter::convert(z);
data[i] = out.x;
data[i + 1] = out.y;
}
}
};
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
// Sanity checks on our vector struct and type-punned pointer arithmetic
static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
/* These and the argument pointers are all declared `restrict` as they are
not aliased in practice. Argument pointers should not be dereferenced
in this kernel as that would be undefined behavior */
auto* __restrict__ input_v =
reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
const int vec_hidden_size = hidden_size / width;
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
auto [x2, x4, x6] = temp.sum_pows();
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
out_v[id] = temp;
}
}
/* Generic poly_norm_kernel
The width field is not used here but necessary for other specializations.
*/
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x4 = x2 * x2;
float x6 = x4 * x2;
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x3 = x2 * x;
out[blockIdx.x * hidden_size + idx] =
(scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
s_bias);
}
}
} // namespace vllm } // namespace vllm
void rms_norm(torch::Tensor& out, // [..., hidden_size] void rms_norm(torch::Tensor& out, // [..., hidden_size]
@ -159,26 +352,18 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
TORCH_CHECK(weight.is_contiguous()); TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1); int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
// We cannot just use `input.stride(-2)` if the tensor is not row-major. int64_t input_stride = input.stride(-2);
// Instead, we use a 2d view to get the second-innermost stride.
// That way the dimensions (except the last one) can be arbitrarily permuted.
torch::Tensor input_view = input.view({-1, hidden_size});
int num_tokens = input_view.numel() / hidden_size;
int64_t input_stride = input_view.stride(-2);
dim3 grid(num_tokens); dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024)); dim3 block(std::min(hidden_size, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES( VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
input_view.scalar_type(), "rms_norm_kernel", [&] { vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>( out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens, });
hidden_size);
});
} }
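The view({-1, hidden_size}) trick above exists so that the second-innermost stride is well defined even when the leading dimensions are permuted: consecutive rows may then sit more than hidden_size elements apart. Below is a host-side sketch of the row-strided RMSNorm the kernel computes, assuming a contiguous output and a per-element weight; rms_norm_ref is a hypothetical name and the real kernel also performs a block-wide reduction.

#include <cmath>

void rms_norm_ref(float* out, const float* input, const float* weight,
                  int num_tokens, int hidden_size, long input_stride,
                  float epsilon) {
  for (int t = 0; t < num_tokens; ++t) {
    const float* row = input + (long)t * input_stride;  // rows may be strided, not packed
    float sum_sq = 0.0f;
    for (int i = 0; i < hidden_size; ++i) sum_sq += row[i] * row[i];
    const float inv_rms = 1.0f / std::sqrt(sum_sq / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i)
      out[(long)t * hidden_size + i] = row[i] * inv_rms * weight[i];
  }
}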
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ #define LAUNCH_FUSED_ADD_RMS_NORM(width) \
@ -195,8 +380,6 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
torch::Tensor& residual, // [..., hidden_size] torch::Tensor& residual, // [..., hidden_size]
torch::Tensor& weight, // [hidden_size] torch::Tensor& weight, // [hidden_size]
double epsilon) { double epsilon) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
TORCH_CHECK(residual.is_contiguous()); TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(weight.is_contiguous()); TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1); int hidden_size = input.size(-1);
@ -231,7 +414,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
wt_ptr % req_alignment_bytes == 0; wt_ptr % req_alignment_bytes == 0;
bool offsets_are_multiple_of_vector_width = bool offsets_are_multiple_of_vector_width =
hidden_size % vector_width == 0 && input_stride % vector_width == 0; hidden_size % vector_width == 0 && input_stride % vector_width == 0;
bool batch_invariant_launch = vllm::vllm_is_batch_invariant(); bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width && if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
!batch_invariant_launch) { !batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);
@ -239,3 +422,50 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
LAUNCH_FUSED_ADD_RMS_NORM(0); LAUNCH_FUSED_ADD_RMS_NORM(0);
} }
} }
#define LAUNCH_FUSED_POLY_NORM(width) \
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \
weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \
hidden_size); \
});
void poly_norm(torch::Tensor& out, // [..., hidden_size]
torch::Tensor& input, // [..., hidden_size]
torch::Tensor& weight, // [3]
torch::Tensor& bias, // [1]
double epsilon) {
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.data_ptr() != input.data_ptr());
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
/* This kernel is memory-latency bound in many scenarios.
When num_tokens is large, a smaller block size allows
for increased block occupancy on CUs and better latency
hiding on global mem ops. */
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
dim3 block(std::min(hidden_size, max_block_size));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
/*If the tensor types are FP16/BF16, try to use the optimized kernel
with packed + vectorized ops.
Max optimization is achieved with a width-8 vector of FP16/BF16s
since we can load at most 128 bits at once in a global memory op.
However, this requires each tensor's data to be aligned to 16
bytes.
*/
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
LAUNCH_FUSED_POLY_NORM(8);
} else {
LAUNCH_FUSED_POLY_NORM(0);
}
}
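For reference, the math both poly_norm kernels implement: per token, x, x^2, and x^3 are each divided by the RMS of that power (derived from the sums of x^2, x^4, and x^6), scaled by weight[2], weight[1], and weight[0] respectively, and summed with the bias. A sequential host-side sketch under that reading; poly_norm_ref is a hypothetical name.

#include <cmath>

void poly_norm_ref(float* out, const float* input, const float weight[3],
                   float bias, int num_tokens, int hidden_size, float epsilon) {
  for (int t = 0; t < num_tokens; ++t) {
    const float* x = input + (long)t * hidden_size;
    float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
    for (int i = 0; i < hidden_size; ++i) {
      const float v = x[i], v2 = v * v;
      s2 += v2;            // sum of x^2
      s4 += v2 * v2;       // sum of x^4
      s6 += v2 * v2 * v2;  // sum of x^6
    }
    const float w2_inv_std  = weight[2] / std::sqrt(s2 / hidden_size + epsilon);
    const float w1_inv_std2 = weight[1] / std::sqrt(s4 / hidden_size + epsilon);
    const float w0_inv_std3 = weight[0] / std::sqrt(s6 / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i) {
      const float v = x[i];
      out[(long)t * hidden_size + i] =
          v * w2_inv_std + v * v * w1_inv_std2 + v * v * v * w0_inv_std3 + bias;
    }
  }
}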

View File

@ -10,7 +10,6 @@
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "cub_helpers.h" #include "cub_helpers.h"
#include "core/batch_invariant.hpp" #include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h> #include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
@ -29,22 +28,10 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance; __shared__ float s_variance;
float variance = 0.0f; float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride; for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x; variance += x * x;
}; }
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>; using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore; __shared__ typename BlockReduce::TempStorage reduceStore;
@ -229,8 +216,6 @@ void fused_add_rms_norm_static_fp8_quant(
double epsilon) { double epsilon) {
TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(residual.is_contiguous()); TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
int hidden_size = input.size(-1); int hidden_size = input.size(-1);
int input_stride = input.stride(-2); int input_stride = input.stride(-2);
int num_tokens = input.numel() / hidden_size; int num_tokens = input.numel() / hidden_size;
@ -256,7 +241,7 @@ void fused_add_rms_norm_static_fp8_quant(
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr()); auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
bool ptrs_are_aligned = bool ptrs_are_aligned =
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_is_batch_invariant(); bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 && if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
!batch_invariant_launch) { !batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8); LAUNCH_FUSED_ADD_RMS_NORM(8);

View File

@ -8,77 +8,12 @@
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../dispatch_utils.h" #include "../dispatch_utils.h"
#include "core/math.hpp"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y)) #define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm { namespace vllm {
namespace moe { namespace moe {
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
static constexpr int32_t num_threads = 1024;
static constexpr int32_t num_blocks = 1;
__global__ void batched_moe_align_block_size_kernel(
int32_t const num_batches, int32_t const max_tokens_per_batch,
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
int32_t* __restrict__ num_tokens_post_pad) {
// TODO(varun): This is a naive implementation. Could be optimized.
size_t const batch_id = threadIdx.x;
size_t const stride = blockDim.x * gridDim.x;
int32_t const num_blocks_per_batch =
CEILDIV(max_tokens_per_batch, block_size);
int32_t const sorted_ids_size =
num_blocks_per_batch * num_batches * block_size;
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Initialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}
int32_t b_num_tokens = 0;
if (batch_id < num_batches) {
b_num_tokens = batch_num_tokens[batch_id];
}
int32_t const ceil_b_num_tokens =
CEILDIV(b_num_tokens, block_size) * block_size;
// Compute prefix sum over the padded token counts per batch
using BlockScan = cub::BlockScan<int32_t, 1024>;
__shared__ typename BlockScan::TempStorage temp_storage;
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
__syncthreads();
bool const is_last_batch = batch_id == (num_batches - 1);
if (is_last_batch) {
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
}
if (batch_id < num_batches) {
int32_t const batch_offset = batch_id * max_tokens_per_batch;
for (size_t i = 0; i < b_num_tokens; ++i) {
sorted_ids[cumsum_val + i] = batch_offset + i;
}
int32_t const block_start = cumsum_val / block_size;
int32_t const num_blocks = ceil_b_num_tokens / block_size;
for (size_t i = 0; i < num_blocks; ++i) {
block_ids[block_start + i] = batch_id;
}
}
}
} // namespace batched_moe_align_block_size
template <typename scalar_t> template <typename scalar_t>
__global__ void moe_align_block_size_kernel( __global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids, const scalar_t* __restrict__ topk_ids,
@ -345,33 +280,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
}); });
} }
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& batch_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor batch_ids,
torch::Tensor num_tokens_post_pad) {
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t const B = batch_num_tokens.size(0);
int32_t const num_blocks_per_batch =
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
int32_t const num_blocks = num_blocks_per_batch * B;
int64_t const sorted_ids_size = num_blocks * block_size;
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
TORCH_CHECK(B <= batched_kernel::num_threads);
batched_kernel::batched_moe_align_block_size_kernel<<<
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>());
}
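In plain terms, the removed kernel pads each batch's token count up to a multiple of block_size, takes an exclusive prefix sum of the padded counts to find each batch's start offset in sorted_ids, and tags every block in a batch's padded range with that batch id. A sequential host-side sketch of that layout; batched_align_ref is a hypothetical name, the real kernel does the prefix sum with cub::BlockScan, and sorted_ids/block_ids are assumed pre-sized and pre-filled with the sentinel and -1 as above.

#include <vector>

void batched_align_ref(int block_size, int max_tokens_per_batch,
                       const std::vector<int>& batch_num_tokens,
                       std::vector<int>& sorted_ids, std::vector<int>& block_ids,
                       int& num_tokens_post_pad) {
  int cumsum = 0;
  for (int b = 0; b < (int)batch_num_tokens.size(); ++b) {
    const int n = batch_num_tokens[b];
    const int padded = (n + block_size - 1) / block_size * block_size;
    for (int i = 0; i < n; ++i)
      sorted_ids[cumsum + i] = b * max_tokens_per_batch + i;   // global token index
    for (int i = 0; i < padded / block_size; ++i)
      block_ids[cumsum / block_size + i] = b;                  // owning batch per block
    cumsum += padded;
  }
  num_tokens_post_pad = cumsum;
}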
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
torch::Tensor& output) // [num_tokens, hidden_size] torch::Tensor& output) // [num_tokens, hidden_size]
{ {

View File

@ -1,169 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/ATen.h>
#include <ATen/cuda/Atomic.cuh>
#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
int32_t col) {
return row * total_col + col;
}
} // namespace
// TODO: Refactor common parts with moe_align_sum_kernels
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_lora_align_sum_kernel(
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
int64_t block_size, int num_experts, int max_loras, size_t numel,
int max_num_tokens_padded, int max_num_m_blocks,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int topk_num, int32_t* total_tokens_post_pad) {
const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
int lora_id = blockIdx.x;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
// Initialize sorted_token_ids with numel
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
}
// Initialize expert_ids with -1
for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
expert_ids[lora_id * max_num_m_blocks + it] = -1;
}
// Initialize total_tokens_post_pad with 0
if (threadIdx.x == 0) {
total_tokens_post_pad[lora_id] = 0;
}
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int mask = token_lora_mapping[i / topk_num] == lora_id;
int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
tokens_cnts[idx] += mask;
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
threadIdx.x;
}
}
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
atomicAdd(
&sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
(i - numel) * mask);
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
}
}
void moe_lora_align_block_size(torch::Tensor topk_ids,
torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size,
int64_t max_loras, int64_t max_num_tokens_padded,
int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad) {
const int topk_num = topk_ids.size(1);
TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
int device_max_shared_mem;
auto dev = topk_ids.get_device();
cudaDeviceGetAttribute(&device_max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE,
TORCH_CHECK(num_thread <= 1024,
"num_thread must be less than 1024, "
"and fallback is not implemented yet.");
const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
(num_experts + 1) * sizeof(int32_t);
if (shared_mem > device_max_shared_mem) {
TORCH_CHECK(false,
"Shared memory usage exceeds device limit, and global memory "
"fallback is not implemented yet.");
}
VLLM_DISPATCH_INTEGRAL_TYPES(
topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
dim3 blockDim(num_thread);
auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
kernel<<<max_loras, blockDim, shared_mem, stream>>>(
topk_ids.data_ptr<scalar_t>(),
token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
max_loras, topk_ids.numel(), max_num_tokens_padded,
max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
expert_ids.data_ptr<int32_t>(), topk_num,
num_tokens_post_pad.data_ptr<int32_t>());
});
}
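The launcher above rejects configurations whose dynamic shared-memory footprint, (num_thread + 1) * num_experts token counters plus (num_experts + 1) cumsum slots of int32, exceeds the device limit. A standalone way to eyeball that budget; the values below are illustrative examples, not vLLM defaults.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int num_experts = 64;                         // example value
  const int num_thread = std::max(num_experts, 128);  // mirrors the launcher's max(num_experts, 128)
  const size_t shared_mem =
      (size_t)(num_thread + 1) * num_experts * sizeof(int32_t) +
      (num_experts + 1) * sizeof(int32_t);
  std::printf("dynamic shared memory needed: %zu bytes\n", shared_mem);
  return 0;
}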

View File

@ -4,7 +4,7 @@
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices, torch::Tensor& token_expert_indices,
torch::Tensor& gating_output, bool renormalize); torch::Tensor& gating_output);
void moe_sum(torch::Tensor& input, torch::Tensor& output); void moe_sum(torch::Tensor& input, torch::Tensor& output);
@ -12,22 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids, int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids, torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad); torch::Tensor num_tokens_post_pad);
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& expert_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
void moe_lora_align_block_size(torch::Tensor topk_ids,
torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size,
int64_t max_loras, int64_t max_num_tokens_padded,
int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM #ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales, torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -16,22 +16,12 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#include <type_traits>
#include <torch/all.h> #include <torch/all.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h> #include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "../cub_helpers.h" #include "../cub_helpers.h"
#include "../core/batch_invariant.hpp"
#ifndef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat162 __nv_bfloat162;
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -47,27 +37,16 @@ template <
/// Alignment requirement in bytes /// Alignment requirement in bytes
int Alignment = sizeof(T) * N int Alignment = sizeof(T) * N
> >
struct alignas(Alignment) AlignedArray { class alignas(Alignment) AlignedArray {
T data[N]; float data[N];
}; };
template <typename T>
__device__ __forceinline__ float toFloat(T value) {
if constexpr (std::is_same_v<T, float>) {
return value;
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
return __bfloat162float(value);
} else if constexpr (std::is_same_v<T, __half>) {
return __half2float(value);
}
}
// ====================== Softmax things =============================== // ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output // We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing. // in the softmax kernel when we extend this module to support expert-choice routing.
template <int TPB, typename InputType> template <int TPB>
__launch_bounds__(TPB) __global__ __launch_bounds__(TPB) __global__
void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols) void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
{ {
using BlockReduce = cub::BlockReduce<float, TPB>; using BlockReduce = cub::BlockReduce<float, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage; __shared__ typename BlockReduce::TempStorage tmpStorage;
@ -88,8 +67,7 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{ {
const int idx = thread_row_offset + ii; const int idx = thread_row_offset + ii;
const float val = toFloat(input[idx]); threadData = max(static_cast<float>(input[idx]), threadData);
threadData = max(val, threadData);
} }
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp()); const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@ -104,8 +82,7 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{ {
const int idx = thread_row_offset + ii; const int idx = thread_row_offset + ii;
const float val = toFloat(input[idx]); threadData += exp((static_cast<float>(input[idx]) - float_max));
threadData += expf(val - float_max);
} }
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp()); const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@ -119,9 +96,8 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB) for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{ {
const int idx = thread_row_offset + ii; const int idx = thread_row_offset + ii;
const float val = toFloat(input[idx]); const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
const float softmax_val = expf(val - float_max) * normalizing_factor; output[idx] = val;
output[idx] = softmax_val;
} }
} }
@ -135,8 +111,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const int num_experts, const int num_experts,
const int k, const int k,
const int start_expert, const int start_expert,
const int end_expert, const int end_expert)
const bool renormalize)
{ {
using cub_kvp = cub::KeyValuePair<int, float>; using cub_kvp = cub::KeyValuePair<int, float>;
@ -151,7 +126,6 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const bool row_is_active = finished ? !finished[block_row] : true; const bool row_is_active = finished ? !finished[block_row] : true;
const int thread_read_offset = blockIdx.x * num_experts; const int thread_read_offset = blockIdx.x * num_experts;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx) for (int k_idx = 0; k_idx < k; ++k_idx)
{ {
thread_kvp.key = 0; thread_kvp.key = 0;
@ -190,23 +164,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(
indices[idx] = should_process_row ? (expert - start_expert) : num_experts; indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
assert(indices[idx] >= 0); assert(indices[idx] >= 0);
source_rows[idx] = k_idx * num_rows + block_row; source_rows[idx] = k_idx * num_rows + block_row;
if (renormalize) {
selected_sum += result_kvp.value;
}
} }
__syncthreads(); __syncthreads();
} }
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (threadIdx.x == 0) {
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx) {
const int idx = k * block_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
} }
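The renormalize path in this hunk rescales the k selected weights of a row so they sum to 1, dividing by 1 instead when the sum is zero. A minimal sketch of that step; renormalize_topk is a hypothetical name.

#include <vector>

void renormalize_topk(std::vector<float>& weights) {
  float sum = 0.0f;
  for (float w : weights) sum += w;
  const float denom = sum > 0.0f ? sum : 1.0f;  // guard against an all-zero row
  for (float& w : weights) w /= denom;
}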
// ====================== TopK softmax things =============================== // ====================== TopK softmax things ===============================
@ -225,30 +185,21 @@ __launch_bounds__(TPB) __global__ void moeTopK(
2) This implementation assumes k is small, but will work for any k. 2) This implementation assumes k is small, but will work for any k.
*/ */
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float> template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices, void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize) int* source_rows, const int k, const int start_expert, const int end_expert)
{ {
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
std::is_same_v<InputType, __half>,
"InputType must be float, __nv_bfloat16, or __half");
// We begin by enforcing compile time assertions and setting up compile time constants. // We begin by enforcing compile time assertions and setting up compile time constants.
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2"); static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16"); static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
// Number of bytes each thread pulls in per load // Number of bytes each thread pulls in per load
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType); static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static constexpr int ELTS_PER_ROW = NUM_EXPERTS; static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT; static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG; static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
"ELTS_PER_LDG must be 1 or even for 16-bit conversion");
}
// Restrictions based on previous section. // Restrictions based on previous section.
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg"); static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp"); static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@ -286,71 +237,27 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
// We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
// row it will read. // row it will read.
const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW; const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
// Now, we compute the group each thread belongs to in order to determine the first column to start loads. // Now, we compute the group each thread belongs to in order to determine the first column to start loads.
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW; const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG; const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread; const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
// this can support all powers of 2 up to 16.
// NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
// We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
using AccessType = AlignedArray<float, ELTS_PER_LDG>;
// Finally, we pull in the data from global mem // Finally, we pull in the data from global mem
float row_chunk[VPT]; float row_chunk[VPT];
AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
// NOTE(zhuhaoran): dispatch loading on the input type; BF16/FP16 are converted to float const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
if constexpr (std::is_same_v<InputType, float>) {
using VecType = AlignedArray<float, ELTS_PER_LDG>;
VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll #pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) { for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW]; {
} row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
} else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
*reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __bfloat162float(*scalar_ptr);
}
}
} else if constexpr (std::is_same_v<InputType, __half>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__half, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __half2float(*scalar_ptr);
}
}
} }
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
@ -404,7 +311,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
int start_col = first_elt_read_by_thread; int start_col = first_elt_read_by_thread;
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx) for (int k_idx = 0; k_idx < k; ++k_idx)
{ {
// First, each thread does the local argmax // First, each thread does the local argmax
@ -458,9 +364,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
output[idx] = max_val; output[idx] = max_val;
indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS; indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
source_rows[idx] = k_idx * num_rows + thread_row; source_rows[idx] = k_idx * num_rows + thread_row;
if (renormalize) {
selected_sum += max_val;
}
} }
// Finally, we clear the value in the thread with the current max if there is another iteration to run. // Finally, we clear the value in the thread with the current max if there is another iteration to run.
@ -478,28 +381,15 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
} }
} }
} }
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (thread_group_idx == 0)
{
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
const int idx = k * thread_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
} }
namespace detail namespace detail
{ {
// Constructs some constants needed to partition the work across threads at compile time. // Constructs some constants needed to partition the work across threads at compile time.
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType> template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
struct TopkConstants struct TopkConstants
{ {
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType); static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, ""); static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM)); static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG; static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@ -508,21 +398,21 @@ struct TopkConstants
}; };
} // namespace detail } // namespace detail
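To make the compile-time partitioning concrete, the arithmetic for one plausible configuration, float inputs, 16-byte loads, 32-lane warps, and 128 experts, is worked below; the numbers are illustrative and other expert counts follow the same formulas.

#include <cstdio>

int main() {
  constexpr int EXPERTS = 128, WARP_SIZE_PARAM = 32, BYTES_PER_LDG = 16;
  constexpr int ELTS_PER_LDG = BYTES_PER_LDG / (int)sizeof(float);  // 4 floats per load
  constexpr int VECS_PER_THREAD =
      (EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) > 0)
          ? EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM)
          : 1;                                                      // MAX(1, ...)
  constexpr int VPT = VECS_PER_THREAD * ELTS_PER_LDG;               // elements per thread
  constexpr int THREADS_PER_ROW = EXPERTS / VPT;                    // threads cooperating on one row
  std::printf("VPT=%d THREADS_PER_ROW=%d\n", VPT, THREADS_PER_ROW);
  return 0;
}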
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType> template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices, void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
cudaStream_t stream)
{ {
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS); static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>; using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int VPT = Constants::VPT; static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP; static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP; const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB; const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB); dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>( topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize); input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
} }
#ifndef USE_ROCM #ifndef USE_ROCM
@ -530,26 +420,26 @@ void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finishe
static_assert(WARP_SIZE == 32, \ static_assert(WARP_SIZE == 32, \
"Unsupported warp size. Only 32 is supported for CUDA"); \ "Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \ gating_output, nullptr, topk_weights, topk_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); token_expert_indices, num_tokens, topk, 0, num_experts, stream);
#else #else
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
if (WARP_SIZE == 64) { \ if (WARP_SIZE == 64) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \ gating_output, nullptr, topk_weights, topk_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \ token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
} else if (WARP_SIZE == 32) { \ } else if (WARP_SIZE == 32) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \ topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \ gating_output, nullptr, topk_weights, topk_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \ token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
} else { \ } else { \
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \ assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
} }
#endif #endif
template <typename IndType, typename InputType> template <typename IndType>
void topkGatingSoftmaxKernelLauncher( void topkGatingSoftmaxKernelLauncher(
const InputType* gating_output, const float* gating_output,
float* topk_weights, float* topk_weights,
IndType* topk_indices, IndType* topk_indices,
int* token_expert_indices, int* token_expert_indices,
@ -557,15 +447,11 @@ void topkGatingSoftmaxKernelLauncher(
const int num_tokens, const int num_tokens,
const int num_experts, const int num_experts,
const int topk, const int topk,
const bool renormalize,
cudaStream_t stream) { cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4; static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM #ifndef USE_ROCM
// for bfloat16/fp16 dtypes, we need 4-byte loads to make sure num_experts static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
// elements can be loaded by a warp
static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
(std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
#endif #endif
switch (num_experts) { switch (num_experts) {
case 1: case 1:
@ -622,11 +508,11 @@ void topkGatingSoftmaxKernelLauncher(
TORCH_CHECK(softmax_workspace != nullptr, TORCH_CHECK(softmax_workspace != nullptr,
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64."); "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256; static constexpr int TPB = 256;
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>( moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts); gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>( moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices, softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts, renormalize); num_experts, topk, 0, num_experts);
} }
} }
} }
@ -634,50 +520,11 @@ void topkGatingSoftmaxKernelLauncher(
} // namespace moe } // namespace moe
} // namespace vllm } // namespace vllm
template<typename ComputeType>
void dispatch_topk_softmax_launch(
torch::Tensor& gating_output,
torch::Tensor& topk_weights,
torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& softmax_workspace,
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
{
if (topk_indices.scalar_type() == at::ScalarType::Int) {
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
}
}
void topk_softmax( void topk_softmax(
torch::Tensor& topk_weights, // [num_tokens, topk] torch::Tensor& topk_weights, // [num_tokens, topk]
torch::Tensor& topk_indices, // [num_tokens, topk] torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk] torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output, // [num_tokens, num_experts] torch::Tensor& gating_output) // [num_tokens, num_experts]
bool renormalize)
{ {
const int num_experts = gating_output.size(-1); const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts; const auto num_tokens = gating_output.numel() / num_experts;
@ -689,19 +536,45 @@ void topk_softmax(
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output)); const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float); torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
if (gating_output.scalar_type() == at::ScalarType::Float) { if(topk_indices.scalar_type() == at::ScalarType::Int)
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices, {
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream); vllm::moe::topkGatingSoftmaxKernelLauncher(
} else if (gating_output.scalar_type() == at::ScalarType::Half) { gating_output.data_ptr<float>(),
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices, topk_weights.data_ptr<float>(),
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream); topk_indices.data_ptr<int>(),
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) { token_expert_indices.data_ptr<int>(),
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices, softmax_workspace.data_ptr<float>(),
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream); num_tokens,
} else { num_experts,
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type()); topk,
stream);
}
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
} }
} }

View File

@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Apply topk softmax to the gating outputs. // Apply topk softmax to the gating outputs.
m.def( m.def(
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! " "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
"token_expert_indices, Tensor gating_output, bool renormalize) -> ()"); "token_expert_indices, Tensor gating_output) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax); m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
// Calculate the result of moe by summing up the partial results // Calculate the result of moe by summing up the partial results
@ -22,31 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()"); " Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size, but for the batched case.
m.def(
"batched_moe_align_block_size(int max_tokens_per_batch,"
" int block_size, Tensor expert_num_tokens,"
" Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
m.def(
"moe_lora_align_block_size(Tensor topk_ids,"
" Tensor token_lora_mapping,"
" int num_experts,"
" int block_size, int max_loras, "
" int max_num_tokens_padded, "
" int max_num_m_blocks, "
" Tensor !sorted_token_ids,"
" Tensor !experts_ids,"
" Tensor !num_tokens_post_pad) -> () ");
m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
#ifndef USE_ROCM #ifndef USE_ROCM
m.def( m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, " "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

View File

@ -92,6 +92,9 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon); torch::Tensor& weight, double epsilon);
void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
torch::Tensor& bias, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits, void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask, const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask, const torch::Tensor& output_mask,
@ -99,11 +102,8 @@ void apply_repetition_penalties_(torch::Tensor& logits,
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts, void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices, const torch::Tensor& rowEnds, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1); torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1);
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const torch::Tensor& seq_lens, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale, torch::Tensor& weight, torch::Tensor& scale,

View File

@ -145,11 +145,7 @@ void rms_norm_dynamic_per_token_quant(
if (scale_ub.has_value()) { if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type); TORCH_CHECK(out.dtype() == kFp8Type);
} }
TORCH_CHECK(weight.dtype() == input.dtype());
TORCH_CHECK(scales.dtype() == torch::kFloat32); TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
}
VLLM_DISPATCH_FLOATING_TYPES( VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] { input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {

View File

@ -9,6 +9,7 @@ from collections.abc import Iterable
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass, fields from dataclasses import dataclass, fields
from functools import reduce from functools import reduce
from typing import Optional, Union
import jinja2 import jinja2
from vllm_cutlass_library_extension import ( from vllm_cutlass_library_extension import (
@ -258,7 +259,7 @@ class ScheduleConfig:
@dataclass(frozen=True) @dataclass(frozen=True)
class TypeConfig: class TypeConfig:
a: DataType a: DataType
b: DataType | VLLMDataType b: Union[DataType, VLLMDataType]
b_group_scale: DataType b_group_scale: DataType
b_group_zeropoint: DataType b_group_zeropoint: DataType
b_channel_scale: DataType b_channel_scale: DataType
@ -279,7 +280,7 @@ class PrepackTypeConfig:
class ImplConfig: class ImplConfig:
types: TypeConfig types: TypeConfig
schedules: list[ScheduleConfig] schedules: list[ScheduleConfig]
heuristic: list[tuple[str | None, ScheduleConfig]] heuristic: list[tuple[Optional[str], ScheduleConfig]]
def generate_sch_sig(schedule_config: ScheduleConfig) -> str: def generate_sch_sig(schedule_config: ScheduleConfig) -> str:

View File

@ -22,14 +22,13 @@ template <typename AllReduceKernel, typename T>
__global__ __quickreduce_launch_bounds_two_shot__ static void __global__ __quickreduce_launch_bounds_two_shot__ static void
allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks, allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
int rank, uint8_t** dbuffer_list, int rank, uint8_t** dbuffer_list,
uint32_t data_offset, uint32_t flag_color, uint32_t data_offset, uint32_t flag_color) {
int64_t data_size_per_phase) {
int block = blockIdx.x; int block = blockIdx.x;
int grid = gridDim.x; int grid = gridDim.x;
while (block < num_blocks) { while (block < num_blocks) {
AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset, AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
flag_color, data_size_per_phase); flag_color);
block += grid; block += grid;
flag_color++; flag_color++;
} }
@ -42,21 +41,21 @@ allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 4) { \ } else if (world_size == 4) { \
using LineCodec = __codec<T, 4>; \ using LineCodec = __codec<T, 4>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} else if (world_size == 8) { \ } else if (world_size == 8) { \
using LineCodec = __codec<T, 8>; \ using LineCodec = __codec<T, 8>; \
using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \ using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \ hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>), \
dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \ dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
num_blocks, rank, dbuffer_list, data_offset, \ num_blocks, rank, dbuffer_list, data_offset, \
flag_color, this->kMaxProblemSize); \ flag_color); \
} }
enum QuickReduceQuantLevel { enum QuickReduceQuantLevel {

View File

@ -553,12 +553,13 @@ struct AllReduceTwoshot {
int const rank, // rank index int const rank, // rank index
uint8_t** __restrict__ buffer_list, // communication buffers uint8_t** __restrict__ buffer_list, // communication buffers
uint32_t const data_offset, // offset to start of the data buffer uint32_t const data_offset, // offset to start of the data buffer
uint32_t flag_color, int64_t data_size_per_phase) { uint32_t flag_color) {
// Topology // Topology
int thread = threadIdx.x + threadIdx.y * kWavefront; int thread = threadIdx.x + threadIdx.y * kWavefront;
uint8_t* rank_buffer = buffer_list[rank]; uint8_t* rank_buffer = buffer_list[rank];
Codec codec(thread, rank); Codec codec(thread, rank);
int block_id = blockIdx.x; int block_id = blockIdx.x;
int grid_size = gridDim.x;
// -------------------------------------------------------- // --------------------------------------------------------
// Read input into registers // Read input into registers
int32x4_t tA[kAtoms]; int32x4_t tA[kAtoms];
@ -587,10 +588,12 @@ struct AllReduceTwoshot {
// rank responsible for this segment. // rank responsible for this segment.
uint32_t comm_data0_offset = uint32_t comm_data0_offset =
data_offset + block_id * Codec::kTransmittedTileSize; data_offset + block_id * Codec::kTransmittedTileSize;
uint32_t comm_data1_offset = data_size_per_phase + comm_data0_offset; uint32_t comm_data1_offset =
grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t)); uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
uint32_t comm_flags1_offset = (data_offset / 2) + comm_flags0_offset; uint32_t comm_flags1_offset =
grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
for (int r = 0; r < kWorldSize; r++) { for (int r = 0; r < kWorldSize; r++) {
int32x4_t* send_buffer = int32x4_t* send_buffer =

View File

@ -54,10 +54,15 @@ static inline __device__ uint16_t extractBinIdx(float x) {
return 511 - (tmp.u16 >> 7); return 511 - (tmp.u16 >> 7);
} }
template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048> template <int kNumThreadsPerBlock = 512>
__device__ void topKPerRowJob(const float* logits, const int rowStart, static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int rowEnd, const int rowIdx, const int* rowEnds, int* outIndices,
int* outIndices, int stride0, int stride1) { float* outLogits, int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The number of elements per thread for the final top-k sort. // The number of elements per thread for the final top-k sort.
static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock; static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
// The class to sort the elements during the final top-k sort. // The class to sort the elements during the final top-k sort.
@ -98,11 +103,17 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
__shared__ int smemHistogram[kNumBins]; __shared__ int smemHistogram[kNumBins];
// Shared memory to store the selected indices. // Shared memory to store the selected indices.
__shared__ int smemIndices[kTopK]; __shared__ int smemIndices[kTopK];
// Shared memory to store the selected logits.
__shared__ float smemLogits[kTopK];
// Shared memory to store the threshold bin. // Shared memory to store the threshold bin.
__shared__ int smemThresholdBinIdx[1]; __shared__ int smemThresholdBinIdx[1];
// Shared memory counter to register the candidates for the final phase. // Shared memory counter to register the candidates for the final phase.
__shared__ int smemFinalDstIdx[1]; __shared__ int smemFinalDstIdx[1];
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
// The length of the row. // The length of the row.
int rowLen = rowEnd - rowStart; int rowLen = rowEnd - rowStart;
@ -113,10 +124,13 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
rowIt += kNumThreadsPerBlock) { rowIt += kNumThreadsPerBlock) {
int idx = rowStart + rowIt; int idx = rowStart + rowIt;
outIndices[rowIdx * kTopK + rowIt] = idx - rowStart; outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
outLogits[rowIdx * kTopK + rowIt] =
logits[rowIdx * stride0 + idx * stride1];
} }
for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK; for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
rowIt += kNumThreadsPerBlock) { rowIt += kNumThreadsPerBlock) {
outIndices[rowIdx * kTopK + rowIt] = -1; outIndices[rowIdx * kTopK + rowIt] = -1;
outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
} }
return; return;
} }
@ -187,6 +201,7 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
uint16_t idx = extractBinIdx(logit); uint16_t idx = extractBinIdx(logit);
if (idx < thresholdBinIdx) { if (idx < thresholdBinIdx) {
int dstIdx = atomicAdd(&smemHistogram[idx], 1); int dstIdx = atomicAdd(&smemHistogram[idx], 1);
smemLogits[dstIdx] = logit;
smemIndices[dstIdx] = rowIt; smemIndices[dstIdx] = rowIt;
} else if (idx == thresholdBinIdx) { } else if (idx == thresholdBinIdx) {
int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1); int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
@ -235,6 +250,7 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x; int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
int dstIdx = baseIdx + srcIdx; int dstIdx = baseIdx + srcIdx;
if (dstIdx < kTopK) { if (dstIdx < kTopK) {
smemLogits[dstIdx] = finalLogits[ii];
smemIndices[dstIdx] = finalIndices[ii]; smemIndices[dstIdx] = finalIndices[ii];
} }
} }
@ -242,58 +258,31 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
// Make sure the data is in shared memory. // Make sure the data is in shared memory.
__syncthreads(); __syncthreads();
// The topK logits.
float topKLogits[kNumTopKItemsPerThread];
// The topK indices.
int topKIndices[kNumTopKItemsPerThread];
// Load from shared memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
}
// Sort the elements.
TopKSort(smemFinal.topKSort)
.SortDescendingBlockedToStriped(topKLogits, topKIndices);
// Store to global memory. // Store to global memory.
#pragma unroll #pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) { for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x; int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
outIndices[offset] = outIndices[offset] = topKIndices[ii] - rowStart;
smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart; outLogits[offset] = topKLogits[ii];
} }
} }
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int* rowEnds, int* outIndices,
int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = rowStarts[rowIdx];
int rowEnd = rowEnds[rowIdx];
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
int* outIndices, int stride0,
int stride1, int next_n) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = 0;
int seq_len = seqLens[rowIdx / next_n];
int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}
} // namespace vllm } // namespace vllm
void apply_repetition_penalties_( void apply_repetition_penalties_(
@ -337,23 +326,10 @@ void apply_repetition_penalties_(
}); });
} }
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const torch::Tensor& seqLens, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::topKPerRowDecode<kNumThreadsPerBlock>
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
indices.data_ptr<int>(), static_cast<int>(stride0),
static_cast<int>(stride1), static_cast<int>(next_n));
}
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts, void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices, const torch::Tensor& rowEnds, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1) { torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1) {
// Compute the results on the device. // Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512; constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -362,5 +338,6 @@ void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
<<<numRows, kNumThreadsPerBlock, 0, stream>>>( <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), rowStarts.data_ptr<int>(), logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
rowEnds.data_ptr<int>(), indices.data_ptr<int>(), rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
static_cast<int>(stride0), static_cast<int>(stride1)); values.data_ptr<float>(), static_cast<int>(stride0),
static_cast<int>(stride1));
} }

View File

@ -175,6 +175,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"float epsilon) -> ()"); "float epsilon) -> ()");
ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
// Polynomial Normalization.
ops.def(
"poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
"epsilon) -> ()");
ops.impl("poly_norm", torch::kCUDA, &poly_norm);
// Apply repetition penalties to logits in-place // Apply repetition penalties to logits in-place
ops.def( ops.def(
"apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, " "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
@ -185,16 +191,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Optimized top-k per row operation // Optimized top-k per row operation
ops.def( ops.def(
"top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, " "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
"Tensor! indices, int numRows, int stride0, " "Tensor! indices, Tensor! values, int numRows, int stride0, "
"int stride1) -> ()"); "int stride1) -> ()");
ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row); ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
ops.def(
"top_k_per_row_decode(Tensor logits, int next_n, "
"Tensor seq_lens, Tensor! indices, int numRows, "
"int stride0, int stride1) -> ()");
ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
// Layernorm-quant // Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor. // Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def( ops.def(

View File

@ -5,7 +5,7 @@
# docs/contributing/dockerfile/dockerfile.md and # docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png # docs/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.9.1 ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
# By parameterizing the base images, we allow third-party to use their own # By parameterizing the base images, we allow third-party to use their own
@ -132,9 +132,7 @@ WORKDIR /workspace
COPY requirements/common.txt requirements/common.txt COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962 uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
&& uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch # cuda arch list used by torch
@ -231,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=500 ARG VLLM_MAX_SIZE_MB=450
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -275,7 +273,6 @@ WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM ARG TARGETPLATFORM
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8 ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE # Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04 ARG GDRCOPY_OS_VERSION=Ubuntu22_04
@ -356,23 +353,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962 uv pip install --system dist/*.whl --verbose \
uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
&& uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# TODO (huydhn): Remove this once xformers is released for 2.9.0
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
BASH
# Install FlashInfer pre-compiled kernel cache and binaries # Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html # https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.4.1 \ uv pip install --system flashinfer-cubin==0.4.0 \
&& uv pip install --system flashinfer-jit-cache==0.4.1 \ && uv pip install --system flashinfer-jit-cache==0.4.0 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config && flashinfer show-config
@ -434,7 +422,6 @@ ARG PYTHON_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -447,8 +434,7 @@ ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \ if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt \ uv pip install --system -r requirements/dev.txt; \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi fi
# install development dependencies (for testing) # install development dependencies (for testing)

View File

@ -31,7 +31,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \ apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \ && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \ gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh && curl -LsSf https://astral.sh/uv/install.sh | sh
@ -106,106 +106,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# install build dependencies
COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG VLLM_USE_PRECOMPILED=""
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ARG vllm_target_device="cpu"
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=450
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
python3 check-wheel-size.py dist; \
else \
echo "Skipping wheel size check."; \
fi
######################### TEST DEPS ######################### ######################### TEST DEPS #########################
FROM base AS vllm-test-deps FROM base AS vllm-test-deps
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \ cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \

View File

@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins # build flashinfer for torch nightly from source around 10 mins
# release version: v0.4.1 # release version: v0.4.0
# todo(elainewy): cache flashinfer build result for faster build # todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
echo "git clone flashinfer..." \ echo "git clone flashinfer..." \
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \ && cd flashinfer \
&& git checkout v0.4.1\ && git checkout v0.4.0 \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \ && echo "finish git clone flashinfer..." \
&& rm -rf build \ && rm -rf build \

View File

@ -1,7 +1,7 @@
# default base image # default base image
ARG REMOTE_VLLM="0" ARG REMOTE_VLLM="0"
ARG COMMON_WORKDIR=/app ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base_custom_1020_rc1_20251008_tuned_20251008 ARG BASE_IMAGE=rocm/vllm-dev:base
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl apt-transport-https ca-certificates wget curl
# Remove sccache # Remove sccache
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR ARG COMMON_WORKDIR

View File

@ -1,13 +1,13 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6" ARG TRITON_BRANCH="f9e5bf54"
ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d" ARG PYTORCH_BRANCH="b2fb6885"
ARG PYTORCH_VISION_BRANCH="v0.23.0" ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394" ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="eef23c7f" ARG AITER_BRANCH="2ab9f4cd"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base

View File

@ -20,6 +20,8 @@ API documentation for vLLM's configuration classes.
- [vllm.config.CompilationConfig][] - [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][] - [vllm.config.VllmConfig][]
[](){ #offline-inference-api }
## Offline Inference ## Offline Inference
LLM Class. LLM Class.
@ -43,14 +45,18 @@ Engine classes for offline and online inference.
Inference parameters for vLLM APIs. Inference parameters for vLLM APIs.
[](){ #sampling-params }
- [vllm.SamplingParams][] - [vllm.SamplingParams][]
- [vllm.PoolingParams][] - [vllm.PoolingParams][]
[](){ #multi-modality }
## Multi-Modality ## Multi-Modality
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package. vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models) Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
via the `multi_modal_data` field in [vllm.inputs.PromptType][]. via the `multi_modal_data` field in [vllm.inputs.PromptType][].
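As a minimal sketch of this (not part of the diff above; the model name and prompt template are illustrative), an image is attached through `multi_modal_data`:

```python
from PIL import Image
from vllm import LLM

# Model and prompt format are examples only; use the template that
# matches your chosen multi-modal model.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")

outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image? ASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```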
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md). Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).

Binary file not shown.


View File

@ -4,6 +4,6 @@ This section lists the most common options for running vLLM.
There are three main levels of configuration, from highest priority to lowest priority: There are three main levels of configuration, from highest priority to lowest priority:
- [Request parameters](../serving/openai_compatible_server.md#completions-api) and [input arguments](../api/README.md#inference-parameters) - [Request parameters][completions-api] and [input arguments][sampling-params]
- [Engine arguments](./engine_args.md) - [Engine arguments](./engine_args.md)
- [Environment variables](./env_vars.md) - [Environment variables](./env_vars.md)

View File

@ -11,7 +11,8 @@ The following code splits the model across 2 GPUs.
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
tensor_parallel_size=2)
``` ```
!!! warning !!! warning
@ -23,7 +24,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
!!! note !!! note
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
You can convert the model checkpoint to a sharded checkpoint using [examples/offline_inference/save_sharded_state.py](../../examples/offline_inference/save_sharded_state.py). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
## Quantization ## Quantization
@ -42,7 +43,9 @@ and the maximum batch size (`max_num_seqs` option).
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2)
``` ```
## Reduce CUDA Graphs ## Reduce CUDA Graphs
@ -58,12 +61,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
```python ```python
from vllm import LLM from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode from vllm.config import CompilationConfig, CompilationLevel
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, level=CompilationLevel.PIECEWISE,
# By default, it goes up to max_num_seqs # By default, it goes up to max_num_seqs
cudagraph_capture_sizes=[1, 2, 4, 8, 16], cudagraph_capture_sizes=[1, 2, 4, 8, 16],
), ),
@ -75,7 +78,8 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True) llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
enforce_eager=True)
``` ```
## Adjust cache size ## Adjust cache size
@ -93,10 +97,8 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM from vllm import LLM
# Accept up to 3 images and 1 video per prompt # Accept up to 3 images and 1 video per prompt
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"image": 3, "video": 1})
limit_mm_per_prompt={"image": 3, "video": 1},
)
``` ```
You can go a step further and disable unused modalities completely by setting its limit to zero. You can go a step further and disable unused modalities completely by setting its limit to zero.
@ -106,10 +108,8 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM from vllm import LLM
# Accept any number of images but no videos # Accept any number of images but no videos
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", limit_mm_per_prompt={"video": 0})
limit_mm_per_prompt={"video": 0},
)
``` ```
You can even run a multi-modal model for text-only inference: You can even run a multi-modal model for text-only inference:
@ -118,10 +118,8 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM from vllm import LLM
# Don't accept images. Just text. # Don't accept images. Just text.
llm = LLM( llm = LLM(model="google/gemma-3-27b-it",
model="google/gemma-3-27b-it", limit_mm_per_prompt={"image": 0})
limit_mm_per_prompt={"image": 0},
)
``` ```
### Configurable options ### Configurable options
@ -175,14 +173,14 @@ Here are some examples:
from vllm import LLM from vllm import LLM
# Available for Qwen2-VL series models # Available for Qwen2-VL series models
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_kwargs={
mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28 "max_pixels": 768 * 768, # Default is 1280 * 28 * 28
) })
# Available for InternVL series models # Available for InternVL series models
llm = LLM( llm = LLM(model="OpenGVLab/InternVL2-2B",
model="OpenGVLab/InternVL2-2B", mm_processor_kwargs={
mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12 "max_dynamic_patch": 4, # Default is 12
) })
``` ```

View File

@ -27,6 +27,8 @@ You can monitor the number of preemption requests through Prometheus metrics exp
In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture. In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
[](){ #chunked-prefill }
## Chunked Prefill ## Chunked Prefill
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations. Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
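As a rough sketch of the knob involved (not part of the diff above; the values are illustrative and assume the engine arguments keep these names), chunking is governed by the scheduler's token budget:

```python
from vllm import LLM

# A smaller max_num_batched_tokens forces long prompts to be prefilled
# in chunks that can be batched together with ongoing decode requests.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    enable_chunked_prefill=True,
    max_num_batched_tokens=2048,
)
```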
@ -98,7 +100,7 @@ from vllm import LLM
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.3-70B-Instruct, model="meta-llama/Llama-3.3-70B-Instruct,
tensor_parallel_size=4, tensor_parallel_size=4,
pipeline_parallel_size=2, pipeline_parallel_size=2
) )
``` ```
@ -172,14 +174,14 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
Known supported models (with corresponding benchmarks): Known supported models (with corresponding benchmarks):
- dots_ocr (<https://github.com/vllm-project/vllm/pull/25466>) - dots_ocr (<gh-pr:25466>)
- GLM-4.1V or above (<https://github.com/vllm-project/vllm/pull/23168>) - GLM-4.1V or above (<gh-pr:23168>)
- InternVL (<https://github.com/vllm-project/vllm/pull/23909>) - InternVL (<gh-pr:23909>)
- Kimi-VL (<https://github.com/vllm-project/vllm/pull/23817>) - Kimi-VL (<gh-pr:23817>)
- Llama4 (<https://github.com/vllm-project/vllm/pull/18368>) - Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<https://github.com/vllm-project/vllm/pull/23327>, <https://github.com/vllm-project/vllm/pull/23948>) - MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2-VL or above (<https://github.com/vllm-project/vllm/pull/22742>, <https://github.com/vllm-project/vllm/pull/24955>, <https://github.com/vllm-project/vllm/pull/25445>) - Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
- Step3 (<https://github.com/vllm-project/vllm/pull/22697>) - Step3 (<gh-pr:22697>)
## Input Processing ## Input Processing
@ -255,24 +257,18 @@ Examples:
```python ```python
# Use a larger cache # Use a larger cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Use a shared-memory based IPC cache # Use a shared-memory based IPC cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", tensor_parallel_size=2,
tensor_parallel_size=2, mm_processor_cache_type="shm",
mm_processor_cache_type="shm", mm_processor_cache_gb=8)
mm_processor_cache_gb=8,
)
# Disable the cache # Disable the cache
llm = LLM( llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=0)
mm_processor_cache_gb=0,
)
``` ```
### Cache Placement ### Cache Placement

View File

@ -96,7 +96,7 @@ Although its common to do this with GPUs, don't try to fragment 2 or 8 differ
### Tune your workloads ### Tune your workloads
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
### Future Topics We'll Cover ### Future Topics We'll Cover

View File

@ -22,7 +22,7 @@ Unsure on where to start? Check out the following links for tasks to work on:
## License ## License
See [LICENSE](../../LICENSE). See <gh-file:LICENSE>.
## Developing ## Developing
@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware,
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
!!! tip !!! tip
vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](../../docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12. vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
@ -88,7 +88,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit.
### Documentation ### Documentation
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, [mkdocs.yaml](../../mkdocs.yaml). MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
Get started with: Get started with:
@ -152,7 +152,7 @@ pytest -s -v tests/test_logger.py
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
!!! important !!! important
If you discover a security vulnerability, please follow the instructions [here](../../SECURITY.md). If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
## Pull Requests & Code Reviews ## Pull Requests & Code Reviews
@ -162,7 +162,7 @@ code quality and improve the efficiency of the review process.
### DCO and Signed-off-by ### DCO and Signed-off-by
When contributing changes to this project, you must agree to the [DCO](../../DCO). When contributing changes to this project, you must agree to the <gh-file:DCO>.
Commits must include a `Signed-off-by:` header which certifies agreement with Commits must include a `Signed-off-by:` header which certifies agreement with
the terms of the DCO. the terms of the DCO.

View File

@ -6,10 +6,9 @@ toc_depth: 4
vLLM provides comprehensive benchmarking tools for performance testing and evaluation: vLLM provides comprehensive benchmarking tools for performance testing and evaluation:
- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing - **[Benchmark CLI]**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing
- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations - **[Performance benchmarks][performance-benchmarks]**: Automated CI benchmarks for development
- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development - **[Nightly benchmarks][nightly-benchmarks]**: Comparative benchmarks against alternatives
- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives
[Benchmark CLI]: #benchmark-cli [Benchmark CLI]: #benchmark-cli
@ -30,13 +29,12 @@ th {
| Dataset | Online | Offline | Data Path | | Dataset | Online | Offline | Data Path |
|---------|--------|---------|-----------| |---------|--------|---------|-----------|
| ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` | | ShareGPT | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
| ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` | | ShareGPT4V (Image) | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:<br>`wget http://images.cocodataset.org/zips/train2017.zip` |
| ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` | | ShareGPT4Video (Video) | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
| BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` | | BurstGPT | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
| Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` | | Sonnet (deprecated) | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
| Random | ✅ | ✅ | `synthetic` | | Random | ✅ | ✅ | `synthetic` |
| RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` | | RandomMultiModal (Image/Video) | 🟡 | 🚧 | `synthetic` |
| RandomForReranking | ✅ | ✅ | `synthetic` |
| Prefix Repetition | ✅ | ✅ | `synthetic` | | Prefix Repetition | ✅ | ✅ | `synthetic` |
| HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` | | HuggingFace-VisionArena | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
| HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` | | HuggingFace-MMVU | ✅ | ✅ | `yale-nlp/MMVU` |
@ -715,7 +713,7 @@ Generate synthetic image inputs alongside random text prompts to stress-test vis
Notes: Notes:
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`. - Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
- Video sampling is not yet implemented. - Video sampling is not yet implemented.
Start the server (example): Start the server (example):
@ -823,7 +821,7 @@ you should set `--endpoint /v1/embeddings` to use the Embeddings API. The backen
- CLIP: `--backend openai-embeddings-clip` - CLIP: `--backend openai-embeddings-clip`
- VLM2Vec: `--backend openai-embeddings-vlm2vec` - VLM2Vec: `--backend openai-embeddings-vlm2vec`
For other models, please add your own implementation inside [vllm/benchmarks/lib/endpoint_request_func.py](../../vllm/benchmarks/lib/endpoint_request_func.py) to match the expected instruction format. For other models, please add your own implementation inside <gh-file:vllm/benchmarks/lib/endpoint_request_func.py> to match the expected instruction format.
You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it. You can use any text or multi-modal dataset to benchmark the model, as long as the model supports it.
For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings. For example, you can use ShareGPT and VisionArena to benchmark vision-language embeddings.
@ -880,207 +878,7 @@ vllm bench serve \
</details> </details>
#### Reranker Benchmark [](){ #performance-benchmarks }
Benchmark the performance of rerank requests in vLLM.
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Unlike generative models which use Completions API or Chat Completions API,
you should set `--backend vllm-rerank` and `--endpoint /v1/rerank` to use the Reranker API.
For reranking, the only supported dataset is `--dataset-name random-rerank`
Start the server:
```bash
vllm serve BAAI/bge-reranker-v2-m3
```
Run the benchmark:
```bash
vllm bench serve \
--model BAAI/bge-reranker-v2-m3 \
--backend vllm-rerank \
--endpoint /v1/rerank \
--dataset-name random-rerank \
--tokenizer BAAI/bge-reranker-v2-m3 \
--random-input-len 512 \
--num-prompts 10 \
--random-batch-size 5
```
For reranker models, this will create `num_prompts / random_batch_size` requests with
`random_batch_size` "documents" where each one has close to `random_input_len` tokens.
In the example above, this results in 2 rerank requests with 5 "documents" each, where
each document has close to 512 tokens.
Please note that `/v1/rerank` is also supported by embedding models, so if you're running
with an embedding model, also set `--no_reranker`. In that case the query is
treated as an individual prompt by the server, so `random_batch_size - 1` documents are sent
per request to account for the extra prompt, which is the query. The token accounting used to
report the throughput numbers is also adjusted accordingly.
</details>
## Parameter Sweeps
### Online Benchmark
[`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) automatically starts `vllm serve` and runs `vllm bench serve` to evaluate vLLM over multiple configurations.
Follow these steps to run the script:
1. Construct the base command to `vllm serve`, and pass it to the `--serve-cmd` option.
2. Construct the base command to `vllm bench serve`, and pass it to the `--bench-cmd` option.
3. (Optional) If you would like to vary the settings of `vllm serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--serve-params`.
- Example: Tuning `--max-num-seqs` and `--max-num-batched-tokens`:
```json
[
{
"max_num_seqs": 32,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 1024
},
{
"max_num_seqs": 64,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 2048
},
{
"max_num_seqs": 128,
"max_num_batched_tokens": 4096
},
{
"max_num_seqs": 256,
"max_num_batched_tokens": 4096
}
]
```
4. (Optional) If you would like to vary the settings of `vllm bench serve`, create a new JSON file and populate it with the parameter combinations you want to test. Pass the file path to `--bench-params`.
- Example: Using different input/output lengths for random dataset:
```json
[
{
"random_input_len": 128,
"random_output_len": 32
},
{
"random_input_len": 256,
"random_output_len": 64
},
{
"random_input_len": 512,
"random_output_len": 128
}
]
```
5. Determine where you want to save the results, and pass that to `--output-dir`.
Example command:
```bash
python -m vllm.benchmarks.sweep.serve \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
-o benchmarks/results
```
!!! important
    If both `--serve-params` and `--bench-params` are passed, the script will iterate over the Cartesian product between them (a sketch of this expansion follows this note).
You can use `--dry-run` to preview the commands to be run.
We only start the server once for each `--serve-params`, and keep it running for multiple `--bench-params`.
Between each benchmark run, we call the `/reset_prefix_cache` and `/reset_mm_cache` endpoints to get a clean slate for the next run.
In case you are using a custom `--serve-cmd`, you can override the commands used for resetting the state by setting `--after-bench-cmd`.
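As a rough sketch of how the combinations expand (file names taken from the example command above; the script's real implementation may differ):

```python
import json

with open("benchmarks/serve_hparams.json") as f:
    serve_params = json.load(f)
with open("benchmarks/bench_hparams.json") as f:
    bench_params = json.load(f)

# One server launch per serve configuration; every bench configuration
# is then run against that server before it is restarted.
for serve_cfg in serve_params:
    for bench_cfg in bench_params:
        print(serve_cfg, bench_cfg)
```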
!!! note
By default, each parameter combination is run 3 times to make the results more reliable. You can adjust the number of runs by setting `--num-runs`.
!!! tip
You can use the `--resume` option to continue the parameter sweep if one of the runs failed.
### SLA Auto-Tuner
[`vllm/benchmarks/sweep/serve_sla.py`](../../vllm/benchmarks/sweep/serve_sla.py) is a wrapper over [`vllm/benchmarks/sweep/serve.py`](../../vllm/benchmarks/sweep/serve.py) that tunes either the request rate or concurrency (choose using `--sla-variable`) in order to satisfy the SLA constraints given by `--sla-params`.
For example, to ensure E2E latency within different target values for 99% of requests:
```json
[
{
"p99_e2el_ms": "<=200"
},
{
"p99_e2el_ms": "<=500"
},
{
"p99_e2el_ms": "<=1000"
},
{
"p99_e2el_ms": "<=2000"
}
]
```
Example command:
```bash
python -m vllm.benchmarks.sweep.serve_sla \
--serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
--bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json' \
--serve-params benchmarks/serve_hparams.json \
--bench-params benchmarks/bench_hparams.json \
--sla-params benchmarks/sla_hparams.json \
--sla-variable max_concurrency \
-o benchmarks/results
```
The algorithm for adjusting the SLA variable is as follows (a sketch is shown after this list):
1. Run the benchmark with infinite QPS, and use the corresponding metrics to determine the initial value of the variable.
- For example, the initial request rate is set to the concurrency under infinite QPS.
2. If the SLA is still satisfied, keep doubling the value until the SLA is no longer satisfied. This gives a relatively narrow window that contains the point where the SLA is barely satisfied.
3. Apply binary search over the window to find the maximum value that still satisfies the SLA.
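A minimal sketch of this search, assuming an integer-valued SLA variable (e.g. concurrency) and hypothetical `run_benchmark`/`meets_sla` helpers:

```python
def tune_sla_variable(run_benchmark, meets_sla, initial_value: int) -> int:
    # Step 1 happens outside this function: initial_value is derived from
    # the infinite-QPS run and is assumed to satisfy the SLA.
    lo, hi = initial_value, initial_value * 2
    # Step 2: keep doubling until the SLA is violated, so the boundary
    # lies between lo (satisfied) and hi (violated).
    while meets_sla(run_benchmark(hi)):
        lo, hi = hi, hi * 2
    # Step 3: binary search for the largest value that still meets the SLA.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if meets_sla(run_benchmark(mid)):
            lo = mid
        else:
            hi = mid
    return lo
```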
!!! important
SLA tuning is applied over each combination of `--serve-params`, `--bench-params`, and `--sla-params`.
For a given combination of `--serve-params` and `--bench-params`, we share the benchmark results across `--sla-params` to avoid rerunning benchmarks with the same SLA variable value.
### Visualizer
[`vllm/benchmarks/sweep/plot.py`](../../vllm/benchmarks/sweep/plot.py) can be used to plot performance curves from parameter sweep results.
Example command:
```bash
python -m vllm.benchmarks.sweep.plot benchmarks/results/<timestamp> \
--var-x max_concurrency \
--row-by random_input_len \
--col-by random_output_len \
--curve-by api_server_count,max_num_batched_tokens \
--filter-by 'max_concurrency<=1024'
```
!!! tip
You can use `--dry-run` to preview the figures to be plotted.
## Performance Benchmarks
The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
### Continuous Benchmarking
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
## Nightly Benchmarks
These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels.
The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html).
More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md).
View File
Download the full log file from Buildkite locally.
Strip timestamps and colorization:
[.buildkite/scripts/ci-clean-log.sh](../../../.buildkite/scripts/ci-clean-log.sh)
```bash
./ci-clean-log.sh ci.log
```
CI test failures may be flaky. Use a bash loop to run repeatedly:
[.buildkite/scripts/rerun-test.sh](../../../.buildkite/scripts/rerun-test.sh)
```bash
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
```
View File
It is standard practice to submit a PR to update the
PyTorch version as early as possible when a new [PyTorch stable
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
This process is non-trivial due to the gap between PyTorch
releases. Using <https://github.com/vllm-project/vllm/pull/16859> as an example, this document outlines common steps to achieve this
update along with a list of potential issues and how to address them.
## Test PyTorch release candidates (RCs)
Additionally, since vLLM's fastcheck pipeline runs in read-only mode,
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.
While ongoing efforts like <https://github.com/vllm-project/vllm/issues/17419>
address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/long_build`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
2. ... to warm it up so that future builds are faster.
## Update dependencies
Several vLLM dependencies, such as FlashInfer and xFormers, depend on PyTorch and need
to be updated accordingly. Rather than waiting for all of them to publish new
releases (which would take too much time), they can be built from
source to unblock the update process.
### FlashInfer
Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
```bash
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
export FLASHINFER_ENABLE_SM90=1
uv pip install --system \
    --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
```
One caveat is that building FlashInfer from source adds approximately 30
minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
public location for immediate installation, such as [this FlashInfer wheel link](https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl). For future releases, contact the PyTorch release
team if you want to get the package published there.
### xFormers
Similar to FlashInfer, here is how to build and install xFormers from source:
```bash
export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
MAX_JOBS=16 uv pip install --system \
    --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
```
## Update all the different vLLM platforms
The separation of requirements and Dockerfiles
for different platforms in vLLM CI/CD allows us to selectively choose
which platforms to update. For instance, updating XPU requires the corresponding
release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
While <https://github.com/vllm-project/vllm/pull/16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<https://github.com/vllm-project/vllm/pull/17444> completed the update for XPU.
View File
# Dockerfile
We provide a [docker/Dockerfile](../../../docker/Dockerfile) to construct the image for running an OpenAI compatible server with vLLM.
More information about deploying with Docker can be found [here](../../deployment/docker.md).
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
View File
# Summary
!!! important
    Many decoder language models can now be automatically loaded using the [Transformers backend](../../models/supported_models.md#transformers) without having to implement them in vLLM. See if `vllm serve <model>` works first!
vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/README.md#compatibility-matrix) to optimize their performance.
View File
This guide walks you through the steps to implement a basic vLLM model.
## 1. Bring your model code
First, clone the PyTorch model code from the source repository.
For instance, vLLM's [OPT model](../../../vllm/model_executor/models/opt.py) was adapted from
HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
!!! warning
```python
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor:
    ...
```
Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
For reference, check out our [Llama implementation](../../../vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out [vllm/model_executor/models](../../../vllm/model_executor/models) for more examples.
## 3. (Optional) Implement tensor parallelism and quantization support
We consider 3 different scenarios:
1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers.
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
V0-only classes and code will be removed in the very near future.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](../../../vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](../../../vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
Please follow the same guidelines as case (2) for implementing these models.
We use "mamba-like" to refer to layers that possess a state that is updated in-place, rather than being appended-to (like KV cache for attention).
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs work as intended.
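To make the required surface area concrete, here is a minimal skeleton of such a layer. This is an illustrative sketch only: the method names come from the description above, but the exact signatures, return types, and the `MambaBase`/metadata plumbing in vLLM are not reproduced here, and `MyShortConvLayer` is a hypothetical name.
```python
import torch
from torch import nn


class MyShortConvLayer(nn.Module):  # in vLLM this would also inherit from MambaBase
    """Hypothetical mamba-like layer whose state is updated in-place."""

    def __init__(self, hidden_size: int, conv_width: int = 4) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.conv_width = conv_width
        self.proj = nn.Linear(hidden_size, hidden_size)

    @property
    def mamba_type(self) -> str:
        # Identifier used to look up the matching attention backend / metadata class
        # (get_attn_backend is omitted in this sketch).
        return "my_short_conv"

    def get_state_dtype(self) -> tuple[torch.dtype, ...]:
        # Data type(s) of the in-place state kept between decoding steps.
        return (torch.float32,)

    def get_state_shape(self) -> tuple[tuple[int, ...], ...]:
        # Per-sequence shape(s) of that state.
        return ((self.conv_width - 1, self.hidden_size),)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # A real layer would read and update its state here; the call would be
        # wrapped in a custom op registered via direct_register_custom_op so that
        # torch.compile and piecewise CUDA graphs treat it as opaque.
        return self.proj(hidden_states)
```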
View File
Further update the model as follows:
```python
    ...

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality.startswith("image"):
            return "<image>"
```
```python
    ...

    def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
        assert self.vision_encoder is not None
        image_features = self.vision_encoder(image_input)
        return self.multi_modal_projector(image_features)

    def get_multimodal_embeddings(
        self,
        **kwargs: object,
    ) -> MultiModalEmbeddings | None:
        # Validate the multimodal input keyword arguments
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
```
For example, if the model supports any number of images but only one video per prompt:
```python
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
    return {"image": None, "video": 1}
```
```python
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)
```
```python
def get_image_size_with_most_features(self) -> ImageSize:
    image_processor = self.get_image_processor()
    return ImageSize(
        width=image_processor.size["width"],
        height=image_processor.size["height"],
    )
```
Fuyu does not expect image placeholders in the inputs to HF processor, so
```python
        return {
            "image":
            self._get_dummy_images(
                width=target_width,
                height=target_height,
                num_images=num_images,
                overrides=image_overrides,
            )
        }
```
!!! note
    Our [actual code](../../../vllm/model_executor/models/llava.py) additionally supports
    pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
=== "With postprocessing: Fuyu"
!!! note
    Our [actual code](../../../vllm/model_executor/models/fuyu.py) has special handling
    for text-only inputs to prevent unnecessary warnings from HF processor.
!!! note
```python
            image_width=image_size.width,
            image_height=image_size.height,
        )

        image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

        return PromptUpdateDetails.select_token_id(
            image_tokens + [bos_token_id],
```
```python
            image_width=image_size.width,
            image_height=image_size.height,
        )

        image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows

        return PromptUpdateDetails.select_token_id(
            image_tokens + [bos_token_id],
```
```diff
  from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY

+ @MULTIMODAL_REGISTRY.register_processor(
+     YourMultiModalProcessor,
+     info=YourProcessingInfo,
+     dummy_inputs=YourDummyInputsBuilder,
+ )
  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```
Examples:
- BLIP-2 (insert at start of prompt): [vllm/model_executor/models/blip2.py](../../../vllm/model_executor/models/blip2.py)
- Molmo (insert after `<|endoftext|>` token): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
### Handling prompt updates unrelated to multi-modal data
Examples:
- Chameleon (appends `sep_token`): [vllm/model_executor/models/chameleon.py](../../../vllm/model_executor/models/chameleon.py)
- Fuyu (appends `boa_token`): [vllm/model_executor/models/fuyu.py](../../../vllm/model_executor/models/fuyu.py)
- Molmo (applies chat template which is not defined elsewhere): [vllm/model_executor/models/molmo.py](../../../vllm/model_executor/models/molmo.py)
### Custom HF processor
Some models don't define an HF processor class on HF Hub.
Examples:
- DeepSeek-VL2: [vllm/model_executor/models/deepseek_vl2.py](../../../vllm/model_executor/models/deepseek_vl2.py)
- InternVL: [vllm/model_executor/models/internvl.py](../../../vllm/model_executor/models/internvl.py)
- Qwen-VL: [vllm/model_executor/models/qwen_vl.py](../../../vllm/model_executor/models/qwen_vl.py)
View File
This page provides detailed instructions on how to do so.
## Built-in models
To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](../../getting_started/installation/gpu.md#build-wheel-from-source).
This gives you the ability to modify the codebase and test your model.
After you have implemented your model (see [tutorial](basic.md)), put it into the [vllm/model_executor/models](../../../vllm/model_executor/models) directory.
Then, add your model class to `_VLLM_MODELS` in [vllm/model_executor/models/registry.py](../../../vllm/model_executor/models/registry.py) so that it is automatically registered upon importing vLLM.
Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!
!!! important
```python
def register():
    ModelRegistry.register_model(
        "YourModelForCausalLM",
        "your_code:YourModelForCausalLM",
    )
```
View File
Without them, the CI for your PR will fail.
### Model loading
Include an example HuggingFace repository for your model in [tests/models/registry.py](../../../tests/models/registry.py).
This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
!!! important
Passing these tests provides more confidence that your implementation is correct.
### Model correctness
These tests compare the model outputs of vLLM against [HF Transformers](https://github.com/huggingface/transformers). You can add new tests under the subdirectories of [tests/models](../../../tests/models).
#### Generative models
For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in [tests/models/utils.py](../../../tests/models/utils.py):
- Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
- Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
#### Pooling models
For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in [tests/models/utils.py](../../../tests/models/utils.py).
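Conceptually, that similarity check just compares the pooled embeddings produced by the two frameworks. The snippet below is a minimal illustration of such a check, not the actual helper defined in `tests/models/utils.py`; the tensor arguments and the 0.99 threshold are assumptions for the example.
```python
import torch
import torch.nn.functional as F


def cosine_similarity_check(vllm_emb: torch.Tensor, hf_emb: torch.Tensor, min_sim: float = 0.99) -> None:
    """Assert that two batches of pooled embeddings point in (almost) the same direction."""
    sim = F.cosine_similarity(vllm_emb.float(), hf_emb.float(), dim=-1)
    assert torch.all(sim >= min_sim), f"cosine similarity too low: {sim.tolist()}"
```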
### Multi-modal processing
#### Common tests
Adding your model to [tests/models/multimodal/processing/test_common.py](../../../tests/models/multimodal/processing/test_common.py) verifies that the following input combinations result in the same outputs:
- Text + multi-modal data
- Tokens + multi-modal data
#### Model-specific tests
You can add a new file under [tests/models/multimodal/processing](../../../tests/models/multimodal/processing) to run tests that only apply to your model.
For example, if the HF processor for your model accepts user-specified keyword arguments, you can verify that the keyword arguments are being applied correctly, such as in [tests/models/multimodal/processing/test_phi3v.py](../../../tests/models/multimodal/processing/test_phi3v.py).
View File
Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g. Whisper).
??? code "supported_languages and supports_transcription_only"
    ```python
    from typing import ClassVar, Mapping, Literal
    import numpy as np
    import torch
    from torch import nn
    ```
Provide an ASR configuration via `get_speech_to_text_config`.
This is for controlling general behavior of the API when serving your model:
??? code "get_speech_to_text_config()"
    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
    ```
Implement the prompt construction via `get_generation_prompt`.
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
??? code "get_generation_prompt()"
    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
            audio: np.ndarray,
            stt_config: SpeechToTextConfig,
            model_config: ModelConfig,
            language: str | None,
            task_type: Literal["transcribe", "translate"],
            request_prompt: str,
            to_language: str | None,
        ) -> PromptType:
            # Example with a free-form instruction prompt
            task_word = "Transcribe" if task_type == "transcribe" else "Translate"
    ```
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
??? code "get_generation_prompt()"
    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
            audio: np.ndarray,
            stt_config: SpeechToTextConfig,
            model_config: ModelConfig,
            language: str | None,
            task_type: Literal["transcribe", "translate"],
            request_prompt: str,
            to_language: str | None,
        ) -> PromptType:
            if language is None:
                raise ValueError("Language must be specified")
    ```
Language validation via `validate_language`:
If your model requires a language and you want a default, override this method (see Whisper):
??? code "validate_language()"
    ```python
    @classmethod
    def validate_language(cls, language: str | None) -> str | None:
        if language is None:
            logger.warning(
                "Defaulting to language='en'. If you wish to transcribe "
                "audio in a different language, pass the `language` field "
                "in the TranscriptionRequest."
            )
            language = "en"
        return super().validate_language(language)
    ```
Token accounting for streaming via `get_num_audio_tokens`:
Provide a fast duration→token estimate to improve streaming usage statistics:
??? code "get_num_audio_tokens()"
    ```python
    class YourASRModel(nn.Module, SupportsTranscription):
        ...
            audio_duration_s: float,
            stt_config: SpeechToTextConfig,
            model_config: ModelConfig,
        ) -> int | None:
            # Return None if unknown; otherwise return an estimate.
            return int(audio_duration_s * stt_config.sample_rate // 320)  # example
    ```
The API server takes care of basic audio I/O and optional chunking.
Relevant server logic:
??? code "_preprocess_speech_to_text()"
    ```python
    # vllm/entrypoints/openai/speech_to_text.py
    async def _preprocess_speech_to_text(...):
    ```
## Examples in-tree
- Whisper encoder-decoder (audio-only): [vllm/model_executor/models/whisper.py](../../../vllm/model_executor/models/whisper.py)
- Voxtral decoder-only (audio embeddings + LLM): [vllm/model_executor/models/voxtral.py](../../../vllm/model_executor/models/voxtral.py)
- Gemma3n decoder-only with fixed instruction prompt: [vllm/model_executor/models/gemma3n_mm.py](../../../vllm/model_executor/models/gemma3n_mm.py)
## Test with the API
Once your model implements `SupportsTranscription`, you can test the endpoints:
```text
http://localhost:8000/v1/audio/translations
```
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
!!! note
    - If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
Some files were not shown because too many files have changed in this diff.