Compare commits


1 commit

Author SHA1 Message Date
90eb28ca21 [V1][Scheduler] Use dict for running queue
This is just a random idea; it still needs to be benchmarked.

Potential advantages for large batch sizes:
- Don't need to copy entire list every iteration
- O(1) removal of aborted requests

Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-13 16:11:07 -04:00
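A minimal sketch of the idea in the commit message (hypothetical names; the real V1 scheduler and Request types carry more state than shown here):

```python
class Request:
    """Placeholder for the V1 request object; only the ID matters here."""

    def __init__(self, request_id: str) -> None:
        self.request_id = request_id


class RunningQueue:
    """Running queue backed by an insertion-ordered dict instead of a list."""

    def __init__(self) -> None:
        # Python dicts preserve insertion order, so iteration still visits
        # requests in the order they started running, like the old list.
        self._running: dict[str, Request] = {}

    def add(self, request: Request) -> None:
        self._running[request.request_id] = request

    def abort(self, request_id: str) -> None:
        # O(1) removal of an aborted request; a list needs an O(n) scan
        # plus a copy/shift of the remaining elements.
        self._running.pop(request_id, None)

    def __iter__(self):
        return iter(self._running.values())
```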
552 changed files with 10120 additions and 34056 deletions


@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.231
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.22
+    value: 0.236
   limit: 1000
   num_fewshot: 5


@@ -13,7 +13,6 @@ from pathlib import Path
 import lm_eval
 import numpy
-import pytest
 import yaml
 
 RTOL = 0.05
@@ -47,10 +46,6 @@ def test_lm_eval_correctness():
     eval_config = yaml.safe_load(
         Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
 
-    if eval_config[
-            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
-        pytest.skip("FBGEMM is currently failing on main.")
-
     # Launch eval requests.
     results = launch_lm_eval(eval_config)


@@ -57,6 +57,8 @@ steps:
     agents:
       queue: tpu_queue_postmerge
     commands:
+      - "rm -f /var/log/syslog"
+      - "rm -f /var/log/kern.log"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
       - "docker push vllm/vllm-tpu:nightly"
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"


@@ -101,30 +101,16 @@ if [[ $commands == *" kernels "* ]]; then
     --ignore=kernels/test_permute_cols.py"
 fi
 
-#ignore certain Entrypoints/openai tests
+#ignore certain Entrypoints tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
   commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_accuracy.py \
   --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_chat.py \
-  --ignore=entrypoints/openai/test_shutdown.py \
-  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
-  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_prompt_validation.py "}
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
 fi
 
-#ignore certain Entrypoints/llm tests
-if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-fi
-
-# --ignore=entrypoints/openai/test_encoder_decoder.py \
-# --ignore=entrypoints/openai/test_embedding.py \
-# --ignore=entrypoints/openai/test_oot_registration.py
-# --ignore=entrypoints/openai/test_accuracy.py \
-# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
-
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then


@@ -44,11 +44,11 @@ remove_docker_container() {
 trap remove_docker_container EXIT
 
 # Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
        -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"

.buildkite/run-tpu-test.sh (new executable file, 25 lines)

@ -0,0 +1,25 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"


@ -1,36 +0,0 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& echo TEST_1 \
&& VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& echo TEST_2 \
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& echo TEST_3 \
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& echo TEST_4 \
&& VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& echo TEST_5 \
&& VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \


@@ -4,28 +4,16 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
-image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
-container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
 # Try building the docker image
-docker build -t ${image_name} -f Dockerfile.xpu .
+docker build -t xpu-test -f Dockerfile.xpu .
 
 # Setup cleanup
-remove_docker_container() {
-  docker rm -f "${container_name}" || true;
-  docker image rm -f "${image_name}" || true;
-  docker system prune -f || true;
-}
+remove_docker_container() { docker rm -f xpu-test || true; }
 trap remove_docker_container EXIT
+remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
-docker run \
-  --device /dev/dri \
-  -v /dev/dri/by-path:/dev/dri/by-path \
-  --entrypoint="" \
-  --name "${container_name}" \
-  "${image_name}" \
-  sh -c '
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '


@@ -41,6 +41,7 @@ steps:
   - grep \"sig sig-object py\" build/html/api/inference_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
+  fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/mq_llm_engine
@@ -117,14 +118,15 @@ steps:
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
+  fast_check: true
   source_file_dependencies:
   - vllm/distributed/
   - vllm/core/
@@ -136,7 +138,7 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   commands:
-  - python3 ../examples/offline_inference/data_parallel.py
+  - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
@@ -150,6 +152,7 @@ steps:
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2
+  fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/metrics
@@ -197,19 +200,16 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
-  - pytest -v -s v1/entrypoints
-  - pytest -v -s v1/engine
-  - pytest -v -s v1/entrypoints
-  - pytest -v -s v1/sample
-  - pytest -v -s v1/worker
-  - pytest -v -s v1/structured_output
-  - pytest -v -s v1/test_stats.py
-  - pytest -v -s v1/test_utils.py
-  - pytest -v -s v1/test_oracle.py
+  - VLLM_USE_V1=1 pytest -v -s v1/core
+  - VLLM_USE_V1=1 pytest -v -s v1/engine
+  - VLLM_USE_V1=1 pytest -v -s v1/sample
+  - VLLM_USE_V1=1 pytest -v -s v1/worker
+  - VLLM_USE_V1=1 pytest -v -s v1/structured_output
+  - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
+  - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
   # TODO: accuracy does not match, whether setting
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-  - pytest -v -s v1/e2e
+  - VLLM_USE_V1=1 pytest -v -s v1/e2e
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -227,17 +227,14 @@ steps:
   - python3 offline_inference/basic/chat.py
   - python3 offline_inference/prefix_caching.py
   - python3 offline_inference/llm_engine_example.py
-  - python3 offline_inference/audio_language.py --seed 0
-  - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_embedding.py --seed 0
-  - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - python3 offline_inference/vision_language.py
+  - python3 offline_inference/vision_language_multi_image.py
+  - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py
-  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
-  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+  - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
@@ -287,6 +284,7 @@ steps:
   parallelism: 4
 
 - label: PyTorch Fullgraph Smoke Test # 9min
+  fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
@@ -381,8 +379,7 @@ steps:
   commands:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
-  # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
+  - pytest -v -s models/test_initialization.py
 
 - label: Language Models Test (Standard) # 32min
   #mirror_hardwares: [amd]
@@ -525,12 +522,13 @@ steps:
   # this test fails consistently.
   # TODO: investigate and fix
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
+  fast_check: true
   source_file_dependencies:
   - vllm/plugins/
   - tests/plugins/


@@ -53,7 +53,7 @@ repos:
         entry: tools/mypy.sh 0 "local"
         language: python
         types: [python]
-        additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+        additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
         stages: [pre-commit] # Don't run in CI
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9


@@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -319,7 +319,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build AllSpark kernels if we are building for at least some compatible archs.
   cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-  if (ALLSPARK_ARCHS)
+  if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND ALLSPARK_ARCHS)
     set(ALLSPARK_SRCS
       "csrc/quantization/gptq_allspark/allspark_repack.cu"
       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
@@ -330,7 +330,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
   else()
     message(STATUS "Not building AllSpark kernels as no compatible archs found"
-                   " in CUDA target architectures")
+                   " in CUDA target architectures, or CUDA not >= 12.0")
   endif()


@@ -222,7 +222,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
+        uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
     fi
 COPY examples examples


@@ -61,7 +61,6 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     cd /install \
     && pip install -U -r requirements/rocm.txt \
-    && pip install -U -r requirements/rocm-test.txt \
     && pip uninstall -y vllm \
     && pip install *.whl


@@ -1,7 +1,11 @@
-# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
-FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
+FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
 
-RUN rm /etc/apt/sources.list.d/intel-graphics.list
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
 
 RUN apt-get update -y && \
     apt-get install -y --no-install-recommends --fix-missing \
@@ -17,6 +21,8 @@ RUN apt-get update -y && \
     python3 \
     python3-dev \
     python3-pip \
+    libze-intel-gpu-dev \
+    libze-intel-gpu1 \
     wget
 
 WORKDIR /workspace/vllm


@@ -13,10 +13,18 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
+---
+
+We're excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
+
+Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
+
+👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
+
+---
+
 *Latest News* 🔥
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).


@@ -43,26 +43,20 @@ become available.
     <tr>
       <td><strong>HuggingFace</strong></td>
       <td style="text-align: center;"></td>
-      <td style="text-align: center;">🟡</td>
+      <td style="text-align: center;">🚧</td>
       <td>Specify your dataset path on HuggingFace</td>
     </tr>
     <tr>
       <td><strong>VisionArena</strong></td>
       <td style="text-align: center;"></td>
-      <td style="text-align: center;"></td>
+      <td style="text-align: center;">🚧</td>
       <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
     </tr>
   </tbody>
 </table>
 
 ✅: supported
 🚧: to be supported
-🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
-similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
-formats, please consider contributing.
 
 **Note**: VisionArena's `dataset-name` should be set to `hf`
 
 ---
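For context, the 🟡 partial-support note above refers to ShareGPT/LLaVA-OneVision-style records that carry a `conversations` column. A hypothetical record of that shape (field values invented purely for illustration) might look like:

```python
# Hypothetical record in the lmms-lab/LLaVA-OneVision-Data style referenced
# above; the "image" field is optional and every value here is made up.
example_record = {
    "image": "<PIL.Image.Image or image file>",
    "conversations": [
        {"from": "human", "value": "What is shown in this image?"},
        {"from": "gpt", "value": "A city skyline at sunset."},
    ],
}
```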
@ -82,10 +76,10 @@ Then run the benchmarking script
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10 NUM_PROMPTS=10
BACKEND="vllm" BACKEND="openai-chat"
DATASET_NAME="sharegpt" DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json" DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
``` ```
If successful, you will see the following output If successful, you will see the following output
@ -129,7 +123,7 @@ DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1" DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train' DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \ --backend "${BACKEND}" \
--model "${MODEL_NAME}" \ --model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \ --endpoint "/v1/chat/completions" \
@ -146,65 +140,35 @@ python3 vllm/benchmarks/benchmark_serving.py \
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10 NUM_PROMPTS=10
DATASET_NAME="sonnet" DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt" DATASET_PATH="benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \ --model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \ --dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \ --dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" --num-prompts "${NUM_PROMPTS}"
``` ```
If successful, you will see the following output If successful, you will see the following output
``` ```
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
### VisionArena Benchmark for Vision Language Models
``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--backend "vllm-chat" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-split "${DATASET_SPLIT}"
```
The `num prompt tokens` now includes image token counts
```
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
``` ```
### Benchmark with LoRA Adapters ### Benchmark with LoRA Adapters
``` bash ``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf" MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm" BACKEND="vllm"
DATASET_NAME="sharegpt" DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json" DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10 NUM_PROMPTS=10
MAX_LORAS=2 MAX_LORAS=2
MAX_LORA_RANK=8 MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora" ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test" LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \ --model "${MODEL_NAME}" \
--backend "${BACKEND}" \ --backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \ --dataset_path "${DATASET_PATH}" \


@@ -14,8 +14,7 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)
 
-# NOTE(simon): do not import vLLM here so the benchmark script
-# can run without vLLM installed.
+from vllm.model_executor.model_loader.weight_utils import get_lock
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -334,7 +333,7 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        ("chat/completions", "profile")
+        "chat/completions"
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     async with aiohttp.ClientSession(trust_env=True,
@@ -428,8 +427,6 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download
 
-        from vllm.model_executor.model_loader.weight_utils import get_lock
-
         # Use file lock to prevent multiple processes from
         # downloading the same model weights at the same time.
         with get_lock(pretrained_model_name_or_path):


@ -46,7 +46,7 @@ class SampleRequest:
Represents a single inference request for benchmarking. Represents a single inference request for benchmarking.
""" """
prompt: Union[str, Any] prompt: str
prompt_len: int prompt_len: int
expected_output_len: int expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@ -84,20 +84,6 @@ class BenchmarkDataset(ABC):
if random_seed is not None else self.DEFAULT_SEED) if random_seed is not None else self.DEFAULT_SEED)
self.data = None self.data = None
def apply_multimodal_chat_transformation(
self,
prompt: str,
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
"""
Transform a prompt and optional multimodal content into a chat format.
This method is used for chat models that expect a specific
conversation format.
"""
content = [{"text": prompt, "type": "text"}]
if mm_content is not None:
content.append(mm_content)
return [{"role": "user", "content": content}]
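For reference, a standalone usage sketch of the `apply_multimodal_chat_transformation` helper shown in the hunk above (the function body mirrors the diff; the image payload shape is an assumed OpenAI-style entry, not something defined in this diff):

```python
from typing import Optional

def apply_multimodal_chat_transformation(
        prompt: str, mm_content: Optional[dict] = None) -> list[dict]:
    # Mirrors the helper above: wrap the prompt (plus optional multimodal
    # content) into a single-user-turn chat message list.
    content = [{"text": prompt, "type": "text"}]
    if mm_content is not None:
        content.append(mm_content)
    return [{"role": "user", "content": content}]

# Assumed OpenAI-style image payload; invented here for illustration.
image_part = {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}
messages = apply_multimodal_chat_transformation("What is in this image?", image_part)
# -> [{"role": "user", "content": [{"text": "...", "type": "text"}, image_part]}]
```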
def load_data(self) -> None: def load_data(self) -> None:
""" """
Load data from the dataset path into self.data. Load data from the dataset path into self.data.
@ -352,7 +338,6 @@ class ShareGPTDataset(BenchmarkDataset):
lora_path: Optional[str] = None, lora_path: Optional[str] = None,
max_loras: Optional[int] = None, max_loras: Optional[int] = None,
output_len: Optional[int] = None, output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
samples: list = [] samples: list = []
for entry in self.data: for entry in self.data:
@ -373,9 +358,6 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len skip_min_output_len_check=output_len
is not None): is not None):
continue continue
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append( samples.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
@ -568,13 +550,10 @@ class HuggingFaceDataset(BenchmarkDataset):
split=self.dataset_split, split=self.dataset_split,
streaming=True, streaming=True,
) )
if self.data.features is None or "conversations" \
not in self.data.features: if "conversations" not in self.data.features:
raise ValueError( raise ValueError("HF Dataset must have a 'conversations' column.")
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations. # Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter( self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2) lambda x: len(x["conversations"]) >= 2)
@ -582,8 +561,9 @@ class HuggingFaceDataset(BenchmarkDataset):
def sample(self, def sample(self,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
num_requests: int, num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None, output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
sampled_requests = [] sampled_requests = []
dynamic_output = output_len is None dynamic_output = output_len is None
@ -591,9 +571,13 @@ class HuggingFaceDataset(BenchmarkDataset):
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
conv = item["conversations"] conv = item["conversations"]
prompt, completion = conv[0]["value"], conv[1]["value"] prompt, completion = conv[0]["value"], conv[1]["value"]
lora_request, tokenizer = self.get_random_lora_request(
tokenizer, lora_path=lora_path, max_loras=max_loras)
prompt_ids = tokenizer(prompt).input_ids prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids) prompt_len = len(prompt_ids)
@ -603,20 +587,16 @@ class HuggingFaceDataset(BenchmarkDataset):
if dynamic_output and not is_valid_sequence( if dynamic_output and not is_valid_sequence(
prompt_len, completion_len): prompt_len, completion_len):
continue continue
mm_content = process_image( mm_content = process_image(
item["image"]) if "image" in item else None item["image"]) if "image" in item else None
if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len and output len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,
prompt_len=prompt_len, prompt_len=prompt_len,
expected_output_len=output_len, expected_output_len=output_len,
multi_modal_data=mm_content, multi_modal_data=mm_content,
lora_request=lora_request,
)) ))
return sampled_requests return sampled_requests
@ -626,7 +606,7 @@ class HuggingFaceDataset(BenchmarkDataset):
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
class VisionArenaDataset(HuggingFaceDataset): class VisionArenaDataset(BenchmarkDataset):
""" """
Vision Arena Dataset. Vision Arena Dataset.
""" """
@ -637,9 +617,14 @@ class VisionArenaDataset(HuggingFaceDataset):
def __init__( def __init__(
self, self,
dataset_split: str,
dataset_subset: Optional[str] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__(**kwargs) super().__init__(**kwargs)
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
if self.dataset_path != self.VISION_ARENA_DATASET_PATH: if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\ raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.") This data path {self.dataset_path} is not valid.")
@ -660,9 +645,9 @@ class VisionArenaDataset(HuggingFaceDataset):
def sample(self, def sample(self,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
num_requests: int, num_requests: int,
output_len: Optional[int] = None, output_len: int = DEFAULT_OUTPUT_LEN,
enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
# TODO (jenniferzhao): Add support for offline benchmark sampling
output_len = (output_len output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN) if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = [] sampled_requests = []
@ -670,14 +655,8 @@ class VisionArenaDataset(HuggingFaceDataset):
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
prompt = item["turns"][0][0]["content"] prompt = item["turns"][0][0]["content"]
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat: mm_content = process_image(item["images"][0])
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append( sampled_requests.append(
SampleRequest( SampleRequest(
prompt=prompt, prompt=prompt,


@@ -684,15 +684,6 @@ def main(args: argparse.Namespace):
                 "Invalid metadata format. Please use KEY=VALUE format."
             )
 
-    if not args.save_detailed:
-        # Remove fields with too many data points
-        for field in [
-                "input_lens", "output_lens", "ttfts", "itls",
-                "generated_texts", "errors"
-        ]:
-            if field in result_json:
-                del result_json[field]
-
     # Traffic
     result_json["request_rate"] = (args.request_rate if args.request_rate
                                    < float("inf") else "inf")
@@ -837,12 +828,6 @@ if __name__ == "__main__":
         action="store_true",
         help="Specify to save benchmark results to a json file",
     )
-    parser.add_argument(
-        "--save-detailed",
-        action="store_true",
-        help="When saving the results, whether to include per request "
-        "information such as response, error, ttfs, tpots, etc.",
-    )
     parser.add_argument(
         "--metadata",
         metavar="KEY=VALUE",


@@ -732,11 +732,8 @@ def main(args: argparse.Namespace):
         api_url = f"http://{args.host}:{args.port}{args.endpoint}"
         base_url = f"http://{args.host}:{args.port}"
 
-    tokenizer = get_tokenizer(
-        tokenizer_id,
-        trust_remote_code=args.trust_remote_code,
-        tokenizer_mode=args.tokenizer_mode,
-    )
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)
 
     if args.dataset == 'grammar':
         args.structure_type = 'guided_grammar'
@@ -879,13 +876,6 @@ if __name__ == "__main__":
         help=
         "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
-    parser.add_argument(
-        "--tokenizer-mode",
-        type=str,
-        default="auto",
-        help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-    )
     parser.add_argument(
         "--num-prompts",
         type=int,


@ -11,9 +11,8 @@ from typing import Any, Optional, Union
import torch import torch
import uvloop import uvloop
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, from benchmark_dataset import (BurstGPTDataset, RandomDataset, SampleRequest,
RandomDataset, SampleRequest, ShareGPTDataset, ShareGPTDataset, SonnetDataset)
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer,
@ -24,7 +23,6 @@ from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args) build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt, TokensPrompt from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@ -34,7 +32,7 @@ def run_vllm(
n: int, n: int,
engine_args: EngineArgs, engine_args: EngineArgs,
disable_detokenize: bool = False, disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
assert all( assert all(
@ -68,13 +66,12 @@ def run_vllm(
use_beam_search = False use_beam_search = False
outputs = None
if not use_beam_search: if not use_beam_search:
start = time.perf_counter() start = time.perf_counter()
outputs = llm.generate(prompts, llm.generate(prompts,
sampling_params, sampling_params,
lora_request=lora_requests, lora_request=lora_requests,
use_tqdm=True) use_tqdm=True)
end = time.perf_counter() end = time.perf_counter()
else: else:
assert lora_requests is None, "BeamSearch API does not support LoRA" assert lora_requests is None, "BeamSearch API does not support LoRA"
@ -92,46 +89,7 @@ def run_vllm(
ignore_eos=True, ignore_eos=True,
)) ))
end = time.perf_counter() end = time.perf_counter()
return end - start, outputs return end - start
def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead.
"""
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests.")
prompts = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
start = time.perf_counter()
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start, outputs
async def run_vllm_async( async def run_vllm_async(
@ -306,8 +264,6 @@ def get_requests(args, tokenizer):
dataset_cls = RandomDataset dataset_cls = RandomDataset
elif args.dataset_name == "sharegpt": elif args.dataset_name == "sharegpt":
dataset_cls = ShareGPTDataset dataset_cls = ShareGPTDataset
if args.backend == "vllm-chat":
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet": elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, ( assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.") "Tokenizer/model must have chat template for sonnet dataset.")
@ -316,19 +272,6 @@ def get_requests(args, tokenizer):
sample_kwargs["return_prompt_formatted"] = True sample_kwargs["return_prompt_formatted"] = True
elif args.dataset_name == "burstgpt": elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
if args.backend != "vllm-chat":
raise ValueError(
"hf datasets only are supported by vllm-chat backend")
# Choose between VisionArenaDataset and HuggingFaceDataset based on
# provided parameters.
dataset_cls = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
else: else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}") raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values # Remove None values
@ -347,7 +290,6 @@ def main(args: argparse.Namespace):
requests = get_requests(args, tokenizer) requests = get_requests(args, tokenizer)
is_multi_modal = any(request.multi_modal_data is not None is_multi_modal = any(request.multi_modal_data is not None
for request in requests) for request in requests)
request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm": if args.backend == "vllm":
if args.async_engine: if args.async_engine:
elapsed_time = uvloop.run( elapsed_time = uvloop.run(
@ -359,9 +301,9 @@ def main(args: argparse.Namespace):
args.disable_detokenize, args.disable_detokenize,
)) ))
else: else:
elapsed_time, request_outputs = run_vllm( elapsed_time = run_vllm(requests, args.n,
requests, args.n, EngineArgs.from_cli_args(args), EngineArgs.from_cli_args(args),
args.disable_detokenize) args.disable_detokenize)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -370,45 +312,20 @@ def main(args: argparse.Namespace):
elif args.backend == "mii": elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len) args.output_len)
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
requests, args.n, EngineArgs.from_cli_args(args),
args.disable_detokenize)
else: else:
raise ValueError(f"Unknown backend: {args.backend}") raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(request.prompt_len + request.expected_output_len
if request_outputs: for request in requests)
# Note: with the vllm and vllm-chat backends, total_output_tokens = sum(request.expected_output_len
# we have request_outputs, which we use to count tokens. for request in requests)
total_prompt_tokens = 0 if is_multi_modal:
total_output_tokens = 0 print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
for ro in request_outputs:
if not isinstance(ro, RequestOutput):
continue
total_prompt_tokens += len(
ro.prompt_token_ids) if ro.prompt_token_ids else 0
total_output_tokens += sum(
len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens
else:
total_num_tokens = sum(r.prompt_len + r.expected_output_len
for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens
if is_multi_modal and args.backend != "vllm-chat":
print("\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not" "following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.") " counted. See vllm-project/vllm/issues/9778 for details.")
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length. # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s") f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}")
# Output JSON results if specified # Output JSON results if specified
if args.output_json: if args.output_json:
@ -424,100 +341,17 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results) save_to_pytorch_benchmark_format(args, results)
def validate_args(args):
"""
Validate command-line arguments.
"""
# === Deprecation and Defaulting ===
if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.",
stacklevel=2)
args.dataset_path = args.dataset
if not getattr(args, "tokenizer", None):
args.tokenizer = args.model
# === Backend Validation ===
valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
if args.backend not in valid_backends:
raise ValueError(f"Unsupported backend: {args.backend}")
# === Dataset Configuration ===
if not args.dataset and not args.dataset_path:
print(
"When dataset path is not set, it will default to random dataset")
args.dataset_name = 'random'
if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset")
# === Dataset Name Specific Checks ===
# --hf-subset and --hf-split: only used
# when dataset_name is 'hf'
if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None):
warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
elif args.dataset_name == "hf" and args.backend != "vllm-chat":
raise ValueError(
"When --dataset-name is 'hf', backend must be 'vllm-chat'")
# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None:
warnings.warn("--random-range-ratio will be ignored since \
--dataset-name is not 'random'.",
stacklevel=2)
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set.
if args.dataset_name not in {"random", "sonnet", None
} and args.prefix_len is not None:
warnings.warn("--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.",
stacklevel=2)
# === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm":
raise ValueError(
"LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True")
# === Backend-specific Validations ===
if args.backend == "hf" and args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend")
if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.backend in {"hf", "mii"} and getattr(args, "quantization",
None) is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.backend == "mii" and args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model:
raise ValueError(
"Tokenizer must be the same as the model for MII backend.")
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", parser.add_argument("--backend",
type=str, type=str,
choices=["vllm", "hf", "mii", "vllm-chat"], choices=["vllm", "hf", "mii"],
default="vllm") default="vllm")
parser.add_argument( parser.add_argument("--dataset-name",
"--dataset-name", type=str,
type=str, choices=["sharegpt", "random", "sonnet", "burstgpt"],
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"], help="Name of the dataset to benchmark on.",
help="Name of the dataset to benchmark on.", default="sharegpt")
default="sharegpt")
parser.add_argument( parser.add_argument(
"--dataset", "--dataset",
type=str, type=str,
@ -585,24 +419,55 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--random-range-ratio", "--random-range-ratio",
type=float, type=float,
default=None, default=1.0,
help="Range of sampled ratio of input/output length, " help="Range of sampled ratio of input/output length, "
"used only for RandomDataSet.", "used only for RandomDataSet.",
) )
# hf dtaset
parser.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
parser.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
validate_args(args) if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next "
"release. Please use '--dataset-name' and "
"'--dataset-path' in the future runs.",
stacklevel=2)
args.dataset_path = args.dataset
if args.dataset is None and args.dataset_path is None:
# for random dataset, the default sampling setting is in
# benchmark_dataset.RandomDataset
print("When dataset is not set, it will default to random dataset")
else:
assert args.input_len is None
if args.enable_lora:
assert args.lora_path is not None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
main(args) main(args)
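The most surprising rule introduced by the new `validate_args` path above is that HF datasets are only wired up for the `vllm-chat` backend. The following standalone sketch is purely illustrative (the `check_hf_dataset` helper and the `Namespace` values are invented for the example and are not part of the benchmark script):

```python
# Standalone illustration of the new dataset-name/backend constraint,
# applied to a hypothetical parsed-args object.
from argparse import Namespace


def check_hf_dataset(args: Namespace) -> None:
    # The HF dataset path is only supported by the vllm-chat backend.
    if args.dataset_name == "hf" and args.backend != "vllm-chat":
        raise ValueError(
            "When --dataset-name is 'hf', backend must be 'vllm-chat'")


check_hf_dataset(Namespace(dataset_name="hf", backend="vllm-chat"))  # accepted
try:
    check_hf_dataset(Namespace(dataset_name="hf", backend="vllm"))   # rejected
except ValueError as err:
    print(err)
```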

View File

@ -17,8 +17,13 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES from weight_shapes import WEIGHT_SHAPES
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -162,25 +167,69 @@ class OpType(Enum):
""" """
LoRA Ops to benchmark and its properties. LoRA Ops to benchmark and its properties.
""" """
LORA_SHRINK = auto() SGMV_SHRINK = auto()
LORA_EXPAND = auto() BGMV_SHRINK = auto()
SGMV_EXPAND = auto()
BGMV_EXPAND = auto()
BGMV_EXPAND_SLICE = auto()
V1_SHRINK = auto()
V1_EXPAND = auto()
@staticmethod @staticmethod
def from_str(s: str) -> "OpType": def from_str(s: str) -> "OpType":
if s.lower() == "lora_shrink": if s.lower() == 'sgmv_shrink':
return OpType.LORA_SHRINK return OpType.SGMV_SHRINK
if s.lower() == "lora_expand": if s.lower() == 'sgmv_expand':
return OpType.LORA_EXPAND return OpType.SGMV_EXPAND
if s.lower() == 'bgmv_shrink':
return OpType.BGMV_SHRINK
if s.lower() == 'bgmv_expand':
return OpType.BGMV_EXPAND
if s.lower() == "bgmv_expand_slice":
return OpType.BGMV_EXPAND_SLICE
if s.lower() == "v1_shrink":
return OpType.V1_SHRINK
if s.lower() == "v1_expand":
return OpType.V1_EXPAND
raise ValueError(f"Unrecognized str {s} to convert to OpType") raise ValueError(f"Unrecognized str {s} to convert to OpType")
def is_shrink_fn(self) -> bool: def is_shrink_fn(self) -> bool:
return self in [OpType.LORA_SHRINK] return self in [
OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
]
def is_expand_fn(self) -> bool: def is_expand_fn(self) -> bool:
return self in [OpType.LORA_EXPAND] return self in [
OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
]
def is_prefill_op(self) -> bool:
return self in [
OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
OpType.V1_EXPAND
]
def is_decode_op(self) -> bool:
return self in [
OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
OpType.V1_SHRINK, OpType.V1_EXPAND
]
def is_expand_slice_fn(self) -> bool:
return self in [OpType.BGMV_EXPAND_SLICE]
def num_slices(self) -> list[int]: def num_slices(self) -> list[int]:
return [1, 2, 3] if self in [
OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
OpType.V1_EXPAND
]:
# SGMV kernels and v1 kernels supports slices
return [1, 2, 3]
if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
return [1]
if self in [OpType.BGMV_EXPAND_SLICE]:
return [2, 3]
raise ValueError(f"Unrecognized OpType {self}")
def mkn(self, batch_size: int, seq_length: int, hidden_size: int, def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int) -> tuple[int, int, int]: lora_rank: int) -> tuple[int, int, int]:
@ -190,7 +239,7 @@ class OpType(Enum):
k = hidden_size k = hidden_size
n = lora_rank n = lora_rank
else: else:
assert self.is_expand_fn() assert self.is_expand_fn() or self.is_expand_slice_fn()
m = num_tokens m = num_tokens
k = lora_rank k = lora_rank
n = hidden_size n = hidden_size
@ -205,7 +254,7 @@ class OpType(Enum):
if self.is_shrink_fn(): if self.is_shrink_fn():
return op_dtype, op_dtype, torch.float32 return op_dtype, op_dtype, torch.float32
else: else:
assert self.is_expand_fn() assert self.is_expand_fn() or self.is_expand_slice_fn()
return torch.float32, op_dtype, op_dtype return torch.float32, op_dtype, op_dtype
def matmul_shapes( def matmul_shapes(
@ -219,19 +268,43 @@ class OpType(Enum):
m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
b_shape = (num_loras, n, k) # col-major b_shape = (num_loras, n, k) # col-major
if self in [OpType.LORA_SHRINK]: if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
# LoRA shrink kernels support num_slices inherently in the kernel. # SGMV shrink and V1 shrink kernels support num_slices inherently
# in the kernel.
return ((m, k), b_shape, (num_slices, m, n)) return ((m, k), b_shape, (num_slices, m, n))
if self in [OpType.LORA_EXPAND]: if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
# LoRA expand kernels support num_slices inherently in the kernel # SGMV expand and V1 expand kernels support num_slices inherently
# in the kernel
return ((num_slices, m, k), b_shape, (m, n * num_slices)) return ((num_slices, m, k), b_shape, (m, n * num_slices))
if self == OpType.BGMV_SHRINK:
return ((m, k), b_shape, (m, n))
if self == OpType.BGMV_EXPAND:
return ((m, k), b_shape, (m, n))
if self == OpType.BGMV_EXPAND_SLICE:
return ((num_slices, m, k), b_shape, (m, n * num_slices))
raise ValueError(f"Unrecognized op_type {self}") raise ValueError(f"Unrecognized op_type {self}")
def bench_fn(self) -> Callable: def bench_fn(self) -> Callable:
if self == OpType.LORA_SHRINK:
return lora_shrink def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
if self == OpType.LORA_EXPAND: for x in kwargs_list:
return lora_expand bgmv_expand_slice(**x)
if self == OpType.SGMV_SHRINK:
return sgmv_shrink
if self == OpType.SGMV_EXPAND:
return sgmv_expand
if self == OpType.BGMV_SHRINK:
return bgmv_shrink
if self == OpType.BGMV_EXPAND:
return bgmv_expand
if self == OpType.BGMV_EXPAND_SLICE:
return emulate_bgmv_expand_slice
if self == OpType.V1_SHRINK:
return v1_shrink
if self == OpType.V1_EXPAND:
return v1_expand
raise ValueError(f"Unrecognized optype {self}") raise ValueError(f"Unrecognized optype {self}")
@ -245,13 +318,34 @@ class OpType(Enum):
""" """
w_dtype = lora_weights[0].dtype w_dtype = lora_weights[0].dtype
num_slices = len(lora_weights) num_slices = len(lora_weights)
if self in [OpType.LORA_SHRINK]: if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
for slice_idx in range(num_slices): for slice_idx in range(num_slices):
ref_group_gemm(ref_out=output[slice_idx, :], ref_group_gemm(ref_out=output[slice_idx, :],
input=input, input=input,
lora_weights=lora_weights[slice_idx], lora_weights=lora_weights[slice_idx],
**kwargs) **kwargs)
elif self in [OpType.LORA_EXPAND]: elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
ref_group_gemm(
ref_out=output[:, slice_offset:slice_offset + hidden_size],
input=input[slice_idx].clone().to(dtype=w_dtype),
lora_weights=lora_weights[slice_idx],
**kwargs)
elif self == OpType.BGMV_SHRINK:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input,
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input.clone().to(dtype=w_dtype),
lora_weights=lora_weights[0],
**kwargs)
elif self == OpType.BGMV_EXPAND_SLICE:
hidden_size = lora_weights[0].shape[1] hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices): for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size slice_offset = slice_idx * hidden_size
@ -317,11 +411,13 @@ class BenchmarkTensors:
input: torch.Tensor input: torch.Tensor
lora_weights_lst: list[torch.Tensor] lora_weights_lst: list[torch.Tensor]
output: torch.Tensor output: torch.Tensor
# LoRA kernel metadata # metadata tensors
lora_kernel_meta: LoRAKernelMeta
# Metadata tensors used in testing correctness
seq_lens: torch.Tensor seq_lens: torch.Tensor
seq_start_loc: torch.Tensor
prompt_lora_mapping: torch.Tensor prompt_lora_mapping: torch.Tensor
token_lora_mapping: torch.Tensor
# v1 kernel metadata
v1_kernel_meta: Optional[V1KernelMeta] = None
def io_types(self) -> str: def io_types(self) -> str:
return (f"{dtype_to_str(self.input.dtype)}x" return (f"{dtype_to_str(self.input.dtype)}x"
@ -348,29 +444,35 @@ class BenchmarkTensors:
assert ctx.num_active_loras <= ctx.num_loras assert ctx.num_active_loras <= ctx.num_loras
total_tokens = ctx.batch_size * ctx.seq_length total_tokens = ctx.batch_size * ctx.seq_length
# Make metadata tensors involved in correctness testing.
# Prepare seq lens tensor # Prepare seq lens tensor
seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
(ctx.batch_size, )) (ctx.batch_size, ))
# Prepare seq_start_loc tensor
seq_start_loc_tensor = torch.cumsum(torch.tensor(
[0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
dim=0)
assert total_tokens == seq_len_tensor.sum() assert total_tokens == seq_len_tensor.sum()
# Prepare prompt lora indices tensor # Prepare prompt lora indices tensor
prompt_lora_indices_tensor = make_prompt_lora_mapping( prompt_lora_indices_tensor = make_prompt_lora_mapping(
ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
# Prepare token lora indices tensor
# Make LoRAKernelMeta
token_lora_indices_tensor = make_token_lora_mapping( token_lora_indices_tensor = make_token_lora_mapping(
total_tokens, ctx.batch_size, prompt_lora_indices_tensor, total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
seq_len_tensor, "cpu") seq_len_tensor, "cpu")
lora_kernel_meta = LoRAKernelMeta.make(
max_loras=ctx.num_loras, v1_kernel_meta = None
max_num_tokens=token_lora_indices_tensor.size(0), if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
device="cpu") v1_kernel_meta = V1KernelMeta.make(
lora_kernel_meta.prepare_tensors( max_loras=ctx.num_loras,
token_lora_mapping=token_lora_indices_tensor) max_num_tokens=token_lora_indices_tensor.size(0),
device="cpu")
v1_kernel_meta.prepare_tensors(
token_lora_mapping=token_lora_indices_tensor)
return BenchmarkTensors(input_tensor, lora_weights, output_tensor, return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
lora_kernel_meta, seq_len_tensor, seq_len_tensor, seq_start_loc_tensor,
prompt_lora_indices_tensor) prompt_lora_indices_tensor,
token_lora_indices_tensor, v1_kernel_meta)
def sanity_check(self) -> None: def sanity_check(self) -> None:
""" """
@ -380,9 +482,9 @@ class BenchmarkTensors:
# check metadata tensors # check metadata tensors
assert torch.sum(self.seq_lens) == num_tokens assert torch.sum(self.seq_lens) == num_tokens
num_seqs = self.seq_lens.shape[0] num_seqs = self.seq_lens.shape[0]
#assert self.seq_start_loc.shape[0] == num_seqs assert self.seq_start_loc.shape[0] == num_seqs
assert self.prompt_lora_mapping.shape[0] == num_seqs assert self.prompt_lora_mapping.shape[0] == num_seqs
assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens assert self.token_lora_mapping.shape[0] == num_tokens
def to_device(self, device: str): def to_device(self, device: str):
""" """
@ -397,27 +499,220 @@ class BenchmarkTensors:
self.input = to_device(self.input) self.input = to_device(self.input)
self.output = to_device(self.output) self.output = to_device(self.output)
self.seq_lens = to_device(self.seq_lens) self.seq_lens = to_device(self.seq_lens)
self.seq_start_loc = to_device(self.seq_start_loc)
self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
self.token_lora_mapping = to_device(self.token_lora_mapping)
for i in range(len(self.lora_weights_lst)): for i in range(len(self.lora_weights_lst)):
self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
# LoRA meta # v1 meta
for field_name in LoRAKernelMeta.__dataclass_fields__: if self.v1_kernel_meta:
field = getattr(self.lora_kernel_meta, field_name) for field_name in V1KernelMeta.__dataclass_fields__:
assert isinstance(field, torch.Tensor) field = getattr(self.v1_kernel_meta, field_name)
setattr(self.lora_kernel_meta, field_name, to_device(field)) assert isinstance(field, torch.Tensor)
setattr(self.v1_kernel_meta, field_name, to_device(field))
def metadata(self) -> tuple[int, int, int]: def metadata(self) -> tuple[int, int, int]:
""" """
Return num_seqs, num_tokens and max_seq_len Return num_seqs, num_tokens and max_seq_len
""" """
num_seqs = self.seq_lens.shape[0] num_seqs = self.seq_lens.shape[0]
num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] num_tokens = self.token_lora_mapping.shape[0]
max_seq_len = torch.max(self.seq_lens).item() max_seq_len = torch.max(self.seq_lens).item()
num_slices = len(self.lora_weights_lst) num_slices = len(self.lora_weights_lst)
return num_seqs, num_tokens, max_seq_len, num_slices return num_seqs, num_tokens, max_seq_len, num_slices
def as_lora_shrink_kwargs(self) -> dict[str, Any]: def convert_to_sgmv_benchmark_tensors(self):
"""
For sgmv punica kernels, when consecutive sequences have the
same LoRA ID, we just merge them together.
This happens in punica.py::compute_metadata
"""
# Collapse seq_lens and seq_start_loc
_, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
return_counts=True)
cum_result = torch.cumsum(seq_lens, dim=0)
seq_start_loc = torch.zeros_like(seq_lens)
seq_start_loc[1:].copy_(cum_result[:-1])
# Collapse prompt mapping
prompt_lora_mapping = torch.unique_consecutive(
self.prompt_lora_mapping)
assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
self.prompt_lora_mapping = prompt_lora_mapping.to(
dtype=self.prompt_lora_mapping.dtype)
self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, hidden_size]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
hidden_size = i_shape[1]
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
assert len(lw_shape) == 3
assert lw_shape[2] == hidden_size
lora_rank = lw_shape[1]
# Expected output shape [num_slices, num_tokens, lora_rank]
assert len(o_shape) == 3
assert o_shape == (num_slices, num_tokens, lora_rank)
return {
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output,
'b_seq_start_loc': self.seq_start_loc,
'seq_len_tensor': self.seq_lens,
'lora_indices_tensor': self.prompt_lora_mapping,
'batches': num_seqs,
'max_seq_length': max_seq_len,
'token_nums': num_tokens,
'scaling': 1.0,
}
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape : [num_slices, num_tokens, lora_rank]
assert len(i_shape) == 3
assert i_shape[0] == num_slices
assert i_shape[1] == num_tokens
lora_rank = i_shape[2]
# Expected lora weight shape : [num_lora, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape : [num_tokens, hidden_size * num_slices]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size * num_slices)
return {
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output,
'b_seq_start_loc': self.seq_start_loc,
'seq_len_tensor': self.seq_lens,
'lora_indices_tensor': self.prompt_lora_mapping,
'batches': num_seqs,
'max_seq_length': max_seq_len,
'token_nums': num_tokens,
'offset_start': 0,
'add_inputs': add_inputs,
}
def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
_, num_tokens, _, _ = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, hidden_size]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
hidden_size = i_shape[1]
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
assert len(lw_shape) == 3
assert lw_shape[2] == hidden_size
lora_rank = lw_shape[1]
# Expected output shape [num_tokens, lora_rank]
assert len(o_shape) == 2
assert o_shape == (num_tokens, lora_rank)
return {
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst[0],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'scaling': 1.0
}
def as_bgmv_expand_kwargs(self, add_inputs: bool):
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
_, num_tokens, _, _ = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, lora_rank]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
lora_rank = i_shape[1]
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape [num_tokens, hidden_size]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size)
return {
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst[0],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'add_inputs': add_inputs
}
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
_, num_tokens, _, num_slices = self.metadata()
# Sanity check shapes
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_slices, num_tokens, lora_rank]
assert len(i_shape) == 3
assert i_shape[0] == num_slices
assert i_shape[1] == num_tokens
lora_rank = i_shape[2]
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape [num_tokens, hidden_size * num_slices]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size * num_slices)
self.to_device(self.input.device)
kwargs_list = []
for i in range(num_slices):
kwargs_list.append({
'inputs': self.input[i],
'lora_b_weights': self.lora_weights_lst[i],
'output_tensor': self.output,
'lora_indices_tensor': self.token_lora_mapping,
'slice_offset': i * hidden_size,
'slice_size': hidden_size,
'add_inputs': add_inputs,
})
return {'kwargs_list': kwargs_list}
def as_v1_shrink_kwargs(self) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
self.sanity_check() self.sanity_check()
self.to_device(self.input.device) self.to_device(self.input.device)
@ -442,16 +737,17 @@ class BenchmarkTensors:
'inputs': self.input, 'inputs': self.input,
'lora_a_weights': self.lora_weights_lst, 'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output, 'output_tensor': self.output,
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids': 'token_indices_sorted_by_lora_ids':
self.lora_kernel_meta.token_indices_sorted_by_lora_ids, self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.lora_kernel_meta.active_lora_ids, 'lora_ids': self.v1_kernel_meta.active_lora_ids,
'scaling': 1.0, 'scaling': 1.0,
} }
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
self.sanity_check() self.sanity_check()
self.to_device(self.input.device) self.to_device(self.input.device)
@ -477,12 +773,12 @@ class BenchmarkTensors:
'inputs': self.input, 'inputs': self.input,
'lora_b_weights': self.lora_weights_lst, 'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output, 'output_tensor': self.output,
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping, 'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids': 'token_indices_sorted_by_lora_ids':
self.lora_kernel_meta.token_indices_sorted_by_lora_ids, self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora, 'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc, 'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.lora_kernel_meta.active_lora_ids, 'lora_ids': self.v1_kernel_meta.active_lora_ids,
'offset_start': 0, 'offset_start': 0,
'add_inputs': add_inputs, 'add_inputs': add_inputs,
} }
@ -495,10 +791,20 @@ class BenchmarkTensors:
else: else:
assert add_inputs is not None assert add_inputs is not None
if op_type == OpType.LORA_SHRINK: if op_type == OpType.SGMV_SHRINK:
return self.as_lora_shrink_kwargs() return self.as_sgmv_shrink_kwargs()
if op_type == OpType.LORA_EXPAND: if op_type == OpType.SGMV_EXPAND:
return self.as_lora_expand_kwargs(add_inputs) return self.as_sgmv_expand_kwargs(add_inputs)
if op_type == OpType.BGMV_SHRINK:
return self.as_bgmv_shrink_kwargs()
if op_type == OpType.BGMV_EXPAND:
return self.as_bgmv_expand_kwargs(add_inputs)
if op_type == OpType.BGMV_EXPAND_SLICE:
return self.as_bgmv_expand_slice_kwargs(add_inputs)
if op_type == OpType.V1_SHRINK:
return self.as_v1_shrink_kwargs()
if op_type == OpType.V1_EXPAND:
return self.as_v1_expand_kwargs(add_inputs)
raise ValueError(f"Unrecognized optype {self}") raise ValueError(f"Unrecognized optype {self}")
def test_correctness(self, op_type: OpType, def test_correctness(self, op_type: OpType,
@ -687,6 +993,10 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
for bench_ctx in bench_ctxs: for bench_ctx in bench_ctxs:
for seq_len in args.seq_lengths: for seq_len in args.seq_lengths:
bench_ops: list[OpType] = args.op_types bench_ops: list[OpType] = args.op_types
if seq_len > 1:
# bench only prefill ops
bench_ops = [op for op in args.op_types if op.is_prefill_op()]
seq_len_timers = [] seq_len_timers = []
for bench_op in bench_ops: for bench_op in bench_ops:
for num_slices in bench_op.num_slices(): for num_slices in bench_op.num_slices():
@ -896,13 +1206,13 @@ Benchmark LoRA kernels:
{use_cuda_graph_recommendation()} {use_cuda_graph_recommendation()}
list_bench example: list_bench example:
python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
model_bench example: model_bench example:
python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
range_bench example: range_bench example:
python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
""", # noqa: E501 """, # noqa: E501
formatter_class=argparse.RawTextHelpFormatter) formatter_class=argparse.RawTextHelpFormatter)
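For readers unfamiliar with the shrink/expand terminology used by the `mkn` and `matmul_shapes` helpers above: shrink projects token activations from `hidden_size` down to `lora_rank` with the LoRA A weight, and expand projects them back up to `hidden_size` with the LoRA B weight. The sketch below shows the reference shapes for a single LoRA and a single slice; the sizes are arbitrary example values, not benchmark defaults.

```python
import torch

num_tokens, hidden_size, lora_rank = 8, 64, 16
x = torch.randn(num_tokens, hidden_size)       # token activations
lora_a = torch.randn(lora_rank, hidden_size)   # one slice of the (num_loras, n, k) A weight
lora_b = torch.randn(hidden_size, lora_rank)   # one slice of the (num_loras, n, k) B weight

shrunk = x @ lora_a.T         # shrink: [num_tokens, hidden_size] -> [num_tokens, lora_rank]
expanded = shrunk @ lora_b.T  # expand: [num_tokens, lora_rank]  -> [num_tokens, hidden_size]

assert shrunk.shape == (num_tokens, lora_rank)
assert expanded.shape == (num_tokens, hidden_size)
```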

View File

@ -54,7 +54,6 @@ for qps in "${QPS_VALUES[@]}"; do
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \ --request-rate $qps \
--result-filename "$FILENAME" \ --result-filename "$FILENAME" \
--tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
--port ${PORT:-8000} --port ${PORT:-8000}
echo "Completed benchmark with QPS: $qps" echo "Completed benchmark with QPS: $qps"

View File

@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(
} // namespace vllm } // namespace vllm
// KV_T is the data type of key and value tensors. // KV_T is the stored data type of kv-cache.
// CACHE_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache. // KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ #define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \ vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
@ -393,8 +393,8 @@ void reshape_and_cache(
CALL_RESHAPE_AND_CACHE) CALL_RESHAPE_AND_CACHE)
} }
// KV_T is the data type of key and value tensors. // KV_T is the stored data type of kv-cache.
// CACHE_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache. // KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ #define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \ vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
@ -446,8 +446,8 @@ void reshape_and_cache_flash(
CALL_RESHAPE_AND_CACHE_FLASH); CALL_RESHAPE_AND_CACHE_FLASH);
} }
// KV_T is the data type of key and value tensors. // KV_T is the stored data type of kv-cache.
// CACHE_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache. // KV_DTYPE is the real data type of kv-cache.
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ #define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \ vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \

View File

@ -3,12 +3,6 @@
#include "cpu_types.hpp" #include "cpu_types.hpp"
#if defined(__x86_64__)
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
#else
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
#endif
namespace { namespace {
template <typename scalar_t> template <typename scalar_t>
void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches, void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
@ -101,12 +95,13 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
} }
const int element_num_per_block = key_caches[0][0].numel(); const int element_num_per_block = key_caches[0][0].numel();
DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { VLLM_DISPATCH_FLOATING_TYPES(
CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping, CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
element_num_per_block, num_layers); copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) element_num_per_block, num_layers);
}); CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
});
} }
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
@ -123,15 +118,16 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
int key_stride = key.stride(0); int key_stride = key.stride(0);
int value_stride = value.stride(0); int value_stride = value.stride(0);
DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { VLLM_DISPATCH_FLOATING_TYPES(
CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
reshape_and_cache_cpu_impl<scalar_t>( CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(), reshape_and_cache_cpu_impl<scalar_t>(
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(), key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride, key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
num_heads, head_size, block_size, x); slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) value_stride, num_heads, head_size, block_size, x);
}); CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
});
} }
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,

View File

@ -16,18 +16,9 @@ namespace vec_op {
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, \
VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__))
#ifndef CPU_OP_GUARD #ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME) #define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME) #define CPU_KERNEL_GUARD_OUT(NAME)

View File

@ -170,7 +170,7 @@ void rotary_embedding_gptj_impl(
void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
torch::Tensor& key, int64_t head_size, torch::Tensor& key, int64_t head_size,
torch::Tensor& cos_sin_cache, bool is_neox) { torch::Tensor& cos_sin_cache, bool is_neox) {
int num_tokens = positions.numel(); int num_tokens = query.numel() / query.size(-1);
int rot_dim = cos_sin_cache.size(1); int rot_dim = cos_sin_cache.size(1);
int num_heads = query.size(-1) / head_size; int num_heads = query.size(-1) / head_size;
int num_kv_heads = key.size(-1) / head_size; int num_kv_heads = key.size(-1) / head_size;

View File

@ -274,7 +274,7 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
[[maybe_unused]] int block_tables_stride = block_tables.stride(0); int block_tables_stride = block_tables.stride(0);
TORCH_CHECK((blocks * threads > num_queries), TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ", "multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0), num_queries, " block_tables.stride(0) = ", block_tables.stride(0),

View File

@ -19,24 +19,12 @@ __device__ __forceinline__ fp8_type cvt_c10(float const r) {
return {}; return {};
} }
// __hip_fp8_e4m3 only exists starting in ROCm 6.3. The macro
// HIP_FP8_TYPE_OCP comes from the hip_fp8.h header and also makes
// its first appearance in ROCm 6.3. Since VLLM_DISPATCH_FP8_TYPES
// on ROCm instantiates both OCP and FNUZ kernels, we need to replace
// the new HW cvt with something reasonable that doesn't rely on the
// ROCm 6.3 feature. This allows compiling on ROCm 6.2 or newer.
template <> template <>
__device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) { __device__ __forceinline__ c10::Float8_e4m3fn cvt_c10(float const r) {
#if HIP_FP8_TYPE_OCP
return c10::Float8_e4m3fn( return c10::Float8_e4m3fn(
__hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation, __hip_cvt_float_to_fp8(r, __hip_fp8_e4m3::__default_saturation,
__hip_fp8_e4m3::__default_interpret), __hip_fp8_e4m3::__default_interpret),
c10::Float8_e4m3fn::from_bits()); c10::Float8_e4m3fn::from_bits());
#else
// Cast implemented by pytorch. Uses bit manipulation instead of HW cvt.
// HW cvt above is faster when it is available (ROCm 6.3 or newer).
return static_cast<c10::Float8_e4m3fn>(r);
#endif
} }
template <> template <>
@ -446,7 +434,7 @@ scaled_vec_conversion<uint16_t, uint8_t>(const uint8_t& a, float scale) {
template <> template <>
__inline__ __device__ uint32_t __inline__ __device__ uint32_t
scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) { scaled_vec_conversion<uint32_t, uint16_t>(const uint16_t& a, float scale) {
[[maybe_unused]] __half2_raw h2r = __half2_raw h2r =
__hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret); __hip_cvt_fp8x2_to_halfraw2(a, fp8_type::__default_interpret);
union { union {
__half2_raw h2r; __half2_raw h2r;

View File

@ -206,8 +206,8 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;
@ -344,8 +344,8 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;
@ -465,8 +465,8 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;
@ -593,8 +593,8 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
int offset_m = blockIdx.y * m_count; int offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; int offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); int end_m = min(offset_m + m_count, size_m);
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
int n = offset_n + t * 4; int n = offset_n + t * 4;

View File

@ -437,10 +437,9 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) { for (int n_idx = 0; n_idx < WARP_NITER; ++n_idx) {
#pragma unroll #pragma unroll
for (int k_idx = 0; k_idx < 2; ++k_idx) { for (int k_idx = 0; k_idx < 2; ++k_idx) {
FType low16 = FType low16 = static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2]);
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2]);
FType high16 = FType high16 =
ScalarType<FType>::float2num(C_frag[m_idx][n_idx][k_idx * 2 + 1]); static_cast<FType>(C_frag[m_idx][n_idx][k_idx * 2 + 1]);
uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) | uint32_t tmp = (reinterpret_cast<uint32_t&>(low16) & 0xffff) |
(reinterpret_cast<uint32_t&>(high16) << 16); (reinterpret_cast<uint32_t&>(high16) << 16);
int sts_offset = int sts_offset =
@ -794,7 +793,7 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
FT scale_reg[4]; FT scale_reg[4];
*(reinterpret_cast<uint2*>(scale_reg)) = *(reinterpret_cast<uint2*>(scale_reg)) =
*(reinterpret_cast<const uint2*>(scales + params_nidx)); *(reinterpret_cast<const uint2*>(scales + params_nidx));
FT zero_reg[4]; FT zero_reg[4] = {0};
if (zeros != nullptr) { if (zeros != nullptr) {
*(reinterpret_cast<uint2*>(zero_reg)) = *(reinterpret_cast<uint2*>(zero_reg)) =
*(reinterpret_cast<const uint2*>(zeros + params_nidx)); *(reinterpret_cast<const uint2*>(zeros + params_nidx));
@ -810,10 +809,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
reinterpret_cast<typename HalfType<FT>::T2*>(&(fval_reg[ni * 4]))); reinterpret_cast<typename HalfType<FT>::T2*>(&(fval_reg[ni * 4])));
#pragma unroll #pragma unroll
for (int ki = 0; ki < 4; ++ki) { for (int ki = 0; ki < 4; ++ki) {
if (zeros != nullptr) { fval_reg[ni * 4 + ki] =
fval_reg[ni * 4 + ki] = __hsub(fval_reg[ni * 4 + ki], zero_reg[ni]); (fval_reg[ni * 4 + ki] - zero_reg[ni]) * scale_reg[ni];
}
fval_reg[ni * 4 + ki] = __hmul(fval_reg[ni * 4 + ki], scale_reg[ni]);
int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 + int sts_offset = sts_base_offset + ((ki / 2) * 8 + (ki % 2)) * 32 +
((ni + lane_id % 4) % 4) * 8; ((ni + lane_id % 4) % 4) * 8;
smem[sts_offset] = fval_reg[ni * 4 + ki]; smem[sts_offset] = fval_reg[ni * 4 + ki];

View File

@ -7,8 +7,6 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_bf16.h> #include <cuda_bf16.h>
#include <iostream> #include <iostream>
#include "../gptq_marlin/marlin_dtypes.cuh"
using marlin::ScalarType;
namespace allspark { namespace allspark {
@ -68,14 +66,14 @@ __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
return; return;
} }
float sum = 0.f; FType sum(0);
int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix; int n_mat = N_MATRIX > 0 ? N_MATRIX : (int)n_matrix;
for (int i = 0; i < n_mat; ++i) { for (int i = 0; i < n_mat; ++i) {
sum += ScalarType<FType>::num2float(C_split[idx + i * matrix_size]); sum += C_split[idx + i * matrix_size];
} }
C[idx] = ScalarType<FType>::float2num(sum); C[idx] = sum;
} }
template <typename FType> template <typename FType>

View File

@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
template <typename T> template <typename T>
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) { __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
[[maybe_unused]] union tmpcvt { union tmpcvt {
uint16_t u; uint16_t u;
_Float16 f; _Float16 f;
__hip_bfloat16 b; __hip_bfloat16 b;
@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template <typename T> template <typename T>
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1, __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
const _B16x4& inp2) { const _B16x4& inp2) {
[[maybe_unused]] union tmpcvt { union tmpcvt {
uint16_t u; uint16_t u;
_Float16 f; _Float16 f;
__hip_bfloat16 b; __hip_bfloat16 b;
@ -308,8 +308,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4); constexpr int GQA_RATIO4 = DIVIDE_ROUND_UP(GQA_RATIO, 4);
[[maybe_unused]] __shared__ float shared_qk_max[NWARPS][16 + 1]; __shared__ float shared_qk_max[NWARPS][16 + 1];
[[maybe_unused]] __shared__ float shared_exp_sum[NWARPS][16 + 1]; __shared__ float shared_exp_sum[NWARPS][16 + 1];
// shared_logits is used for multiple purposes // shared_logits is used for multiple purposes
__shared__ _B16x4 shared_logits[NWARPS][4][16][4]; __shared__ _B16x4 shared_logits[NWARPS][4][16][4];
@ -426,8 +426,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride; const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
const int klocal_token_idx = const int klocal_token_idx =
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id; TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
[[maybe_unused]] const int kglobal_token_idx = const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
partition_start_token_idx + klocal_token_idx;
const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE; const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX; const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
@ -1273,9 +1272,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const int seq_idx = blockIdx.y; const int seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx]; const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE); const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE;
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE;
__shared__ float shared_global_exp_sum; __shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS // max num partitions supported is warp_size * NPAR_LOOPS

View File

@ -370,7 +370,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> " "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
"bool"); "bool");
ops.impl("cutlass_scaled_mm_supports_block_fp8", ops.impl("cutlass_scaled_mm_supports_block_fp8",
&cutlass_scaled_mm_supports_block_fp8); &cutlass_scaled_mm_supports_fp8);
// Check if cutlass sparse scaled_mm is supported for CUDA devices of the // Check if cutlass sparse scaled_mm is supported for CUDA devices of the
// given capability // given capability

View File

@ -4,7 +4,6 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)

View File

@ -34,8 +34,7 @@ Further update the model as follows:
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
@ -62,7 +61,7 @@ Further update the model as follows:
def get_input_embeddings( def get_input_embeddings(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None, multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor: ) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language # `get_input_embeddings` should already be implemented for the language

View File

@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them: create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile ```Dockerfile
FROM vllm/vllm-openai:v0.8.0 FROM vllm/vllm-openai:v0.7.3
# e.g. install the `audio` and `video` optional dependencies # e.g. install the `audio` and `video` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image! # NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install vllm[audio,video]==0.8.0 RUN uv pip install --system vllm[audio,video]==0.7.3
``` ```
::: :::
@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
```Dockerfile ```Dockerfile
FROM vllm/vllm-openai:latest FROM vllm/vllm-openai:latest
RUN uv pip install git+https://github.com/huggingface/transformers.git RUN uv pip install --system git+https://github.com/huggingface/transformers.git
``` ```
::: :::

View File

@ -4,9 +4,9 @@
A Helm chart to deploy vLLM for Kubernetes A Helm chart to deploy vLLM for Kubernetes
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values. Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file. This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
## Prerequisites ## Prerequisites

View File

@ -4,19 +4,17 @@
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
Alternatively, you can deploy vLLM to Kubernetes using any of the following: --------
* [Helm](frameworks/helm.md)
* [InftyAI/llmaz](integrations/llmaz.md) Alternatively, you can also deploy Kubernetes using [helm chart](https://docs.vllm.ai/en/latest/deployment/frameworks/helm.html). There are also open-source projects available to make your deployment even smoother.
* [KServe](integrations/kserve.md)
* [kubernetes-sigs/lws](frameworks/lws.md) * [vLLM production-stack](https://github.com/vllm-project/production-stack): Born out of a Berkeley-UChicago collaboration, vLLM production stack is a project that contains latest research and community effort, while still delivering production-level stability and performance. Checkout the [documentation page](https://docs.vllm.ai/en/latest/deployment/integrations/production-stack.html) for more details and examples.
* [meta-llama/llama-stack](integrations/llamastack.md)
* [substratusai/kubeai](integrations/kubeai.md) --------
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
* [vllm-project/production-stack](integrations/production-stack.md)
## Pre-requisite ## Pre-requisite
Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/). Ensure that you have a running Kubernetes environment with GPU (you can follow [this tutorial](https://github.com/vllm-project/production-stack/blob/main/tutorials/00-install-kubernetes-env.md) to install a Kubernetes environment on a bare-medal GPU machine).
## Deployment using native K8s ## Deployment using native K8s

View File

@ -419,7 +419,7 @@ List of `v_vec` for one thread
which is also `V_VEC_SIZE` elements from `logits`. Overall, with which is also `V_VEC_SIZE` elements from `logits`. Overall, with
multiple inner iterations, each warp will process one block of value multiple inner iterations, each warp will process one block of value
tokens. And with multiple outer iterations, the whole context value tokens. And with multiple outer iterations, the whole context value
tokens are processed tokens are processd
```cpp ```cpp
float accs[NUM_ROWS_PER_THREAD]; float accs[NUM_ROWS_PER_THREAD];

View File

@ -13,7 +13,7 @@ Ensure the v1 LLM Engine exposes a superset of the metrics available in v0.
Metrics in vLLM can be categorized as follows: Metrics in vLLM can be categorized as follows:
1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus. 1. Server-level metrics: these are global metrics that track the state and performance of the LLM engine. These are typically exposed as Gauges or Counters in Prometheus.
2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking. 2. Request-level metrics: these are metrics that track the characteristics - e.g. size and timing - of individual requests. These are typically exposed as Histrograms in Prometheus, and are often the SLO that an SRE monitoring vLLM will be tracking.
The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are. The mental model is that the "Server-level Metrics" explain why the "Request-level Metrics" are what they are.
@ -47,7 +47,7 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
- `vllm:tokens_total` (Counter) - `vllm:tokens_total` (Counter)
- `vllm:iteration_tokens_total` (Histogram) - `vllm:iteration_tokens_total` (Histogram)
- `vllm:time_in_queue_requests` (Histogram) - `vllm:time_in_queue_requests` (Histogram)
- `vllm:model_forward_time_milliseconds` (Histogram) - `vllm:model_forward_time_milliseconds` (Histogram
- `vllm:model_execute_time_milliseconds` (Histogram) - `vllm:model_execute_time_milliseconds` (Histogram)
- `vllm:request_params_n` (Histogram) - `vllm:request_params_n` (Histogram)
- `vllm:request_params_max_tokens` (Histogram) - `vllm:request_params_max_tokens` (Histogram)
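All of the series above are exported through the Prometheus-compatible `/metrics` endpoint mentioned earlier in this document. A small standalone sketch (not vLLM code) that dumps only the `vllm:`-prefixed series, assuming a server is already running on localhost:8000:

```python
import urllib.request

with urllib.request.urlopen("http://localhost:8000/metrics") as resp:
    body = resp.read().decode()

for line in body.splitlines():
    if line.startswith("vllm:"):   # keep only vLLM's own series
        print(line)
```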

View File

@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total. In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens. **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-1.png :::{image} /assets/design/v1/prefix_caching/example-time-1.png
:alt: Example Time 1 :alt: Example Time 1
@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
:alt: Example Time 3 :alt: Example Time 3
::: :::
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens. **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
:::{image} /assets/design/v1/prefix_caching/example-time-4.png :::{image} /assets/design/v1/prefix_caching/example-time-4.png
:alt: Example Time 4 :alt: Example Time 4
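A purely illustrative sketch of the block-aligned matching described above (the block size, token sequences, and helper function are made up for this example and are not the KV-cache manager's actual code):

```python
# Illustrative sketch of block-aligned prefix matching (not vLLM's implementation).
BLOCK_SIZE = 4

def num_cached_blocks(cached_tokens: list[int], new_tokens: list[int]) -> int:
    """Count how many leading full blocks of new_tokens hit the cache."""
    hits = 0
    for start in range(0, len(new_tokens), BLOCK_SIZE):
        block = new_tokens[start:start + BLOCK_SIZE]
        cached_block = cached_tokens[start:start + BLOCK_SIZE]
        # Only full, exactly matching blocks can be reused from the cache.
        if len(block) == BLOCK_SIZE and block == cached_block:
            hits += 1
        else:
            break
    return hits

request_0 = list(range(15))                      # 15 prompt tokens cached earlier
request_1 = list(range(10)) + [99, 98, 97, 96]   # 14 tokens, first 10 shared
print(num_cached_blocks(request_0, request_1))   # -> 2 (8 tokens hit the cache)
```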
View File
@ -110,7 +110,7 @@ In addition to serving LoRA adapters at server startup, the vLLM server now supp
LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
to change models on-the-fly is needed. to change models on-the-fly is needed.
Note: Enabling this feature in production environments is risky as users may participate in model adapter management. Note: Enabling this feature in production environments is risky as user may participate model adapter management.
To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
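A hedged sketch of calling the dynamic load/unload endpoints from Python is shown below; the adapter name, adapter path, and server address are placeholders.

```python
# Hedged sketch: driving the dynamic LoRA endpoints with `requests`.
# The adapter name/path and server address below are placeholders.
import requests

BASE_URL = "http://localhost:8000"

# Load an adapter at runtime (requires VLLM_ALLOW_RUNTIME_LORA_UPDATING=True).
resp = requests.post(
    f"{BASE_URL}/v1/load_lora_adapter",
    json={"lora_name": "sql_adapter", "lora_path": "/path/to/sql-lora"},
)
resp.raise_for_status()

# Unload it again when it is no longer needed.
resp = requests.post(
    f"{BASE_URL}/v1/unload_lora_adapter",
    json={"lora_name": "sql_adapter"},
)
resp.raise_for_status()
```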
View File
@ -162,7 +162,7 @@ A variety of speculative models of this type are available on HF hub:
## Speculating using EAGLE based draft models ## Speculating using EAGLE based draft models
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](<gh-file:examples/offline_inference/eagle.py>). an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model.
```python ```python
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
View File
@ -15,7 +15,7 @@ more are listed [here](#supported-models).
By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
but they are expected to be inferior to models that are specifically trained on embedding tasks. but they are expected be inferior to models that are specifically trained on embedding tasks.
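As a hedged sketch, assuming the engine's `task="embed"` option, a generative checkpoint can be run for embeddings roughly like this (the model choice is illustrative):

```python
# Hedged sketch: running a generative checkpoint as an embedding model.
# Assumes the `task="embed"` engine option; the model name is illustrative.
from vllm import LLM

llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3", task="embed")
outputs = llm.embed(["Hello, my name is", "The capital of France is"])
for output in outputs:
    # Each output carries one pooled embedding vector for its prompt.
    print(len(output.outputs.embedding))
```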
______________________________________________________________________ ______________________________________________________________________
View File
@ -119,7 +119,7 @@ If you're observing the following error: `docker: Error response from daemon: Un
## Supported configurations ## Supported configurations
The following configurations have been validated to function with The following configurations have been validated to be function with
Gaudi2 devices. Configurations that are not listed may or may not work. Gaudi2 devices. Configurations that are not listed may or may not work.
- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) - [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b)
View File
@ -19,7 +19,7 @@ Currently, there are no pre-built OpenVINO wheels.
### Build wheel from source ### Build wheel from source
First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run: First, install Python and ensure you lave the latest pip. For example, on Ubuntu 22.04, you can run:
```console ```console
sudo apt-get update -y sudo apt-get update -y
View File
@ -189,13 +189,12 @@ vLLM CPU backend supports the following vLLM features:
- Model Quantization (`INT8 W8A8, AWQ, GPTQ`) - Model Quantization (`INT8 W8A8, AWQ, GPTQ`)
- Chunked-prefill - Chunked-prefill
- Prefix-caching - Prefix-caching
- FP8-E5M2 KV cache - FP8-E5M2 KV-Caching (TODO)
## Related runtime environment variables ## Related runtime environment variables
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
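As a hedged example, the variables above can also be set from Python before the engine is created; the values shown are illustrative and should match your hardware:

```python
# Hedged sketch: configuring the CPU backend variables before building the engine.
# The values are illustrative only; tune them to your machine.
import os

os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"       # 40 GB reserved for the KV cache
os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-31"  # pin OpenMP threads to cores 0-31
os.environ["VLLM_CPU_MOE_PREPACK"] = "1"          # default; set "0" on unsupported CPUs

from vllm import LLM  # import after the environment is configured

llm = LLM(model="facebook/opt-125m")
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```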
## Performance tips ## Performance tips
View File
@ -131,8 +131,6 @@ Building from source requires a lot of compilation. If you are building from sou
For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
::: :::
View File
@ -1,6 +1,6 @@
# Installation # Installation
vLLM initially supports basic model inference and serving on Intel GPU platform. vLLM initially supports basic model inferencing and serving on Intel GPU platform.
:::{attention} :::{attention}
There are no pre-built wheels or images for this device, so you must build vLLM from source. There are no pre-built wheels or images for this device, so you must build vLLM from source.
@ -65,7 +65,7 @@ $ docker run -it \
## Supported features ## Supported features
XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following: XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following:
```console ```console
python -m vllm.entrypoints.openai.api_server \ python -m vllm.entrypoints.openai.api_server \
@ -78,6 +78,6 @@ python -m vllm.entrypoints.openai.api_server \
-tp=8 -tp=8
``` ```
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script. By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
There are some new features coming with ipex-xpu 2.6, e.g. **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc. There are some new features coming with ipex-xpu 2.6, eg: **chunked prefill**, **V1 engine support**, **lora**, **MoE**, etc.
View File
@ -1,161 +0,0 @@
# vLLM V1 User Guide
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
## Why vLLM V1?
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
Building on V0's success, vLLM V1 retains the stable and proven components from V0
(such as the models, GPU kernels, and utilities). At the same time, it significantly
re-architects the core systems, covering the scheduler, KV cache manager, worker,
sampler, and API server, to provide a cohesive, maintainable framework that better
accommodates continued growth and innovation.
Specifically, V1 aims to:
- Provide a **simple, modular, and easy-to-hack codebase**.
- Ensure **high performance** with near-zero CPU overhead.
- **Combine key optimizations** into a unified architecture.
- Require **zero configs** by enabling features/optimizations by default.
We see significant performance improvements from upgrading to the V1 core engine, in
particular for long-context scenarios. Please see the performance benchmark (to be
added).
For more details, check out the vLLM V1 blog post [vLLM V1: A Major
Upgrade to vLLM's Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html) (published Jan 27, 2025).
This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to make V1 the default engine, so this guide will be updated continually as more features gain support on vLLM V1.
### Supports Overview
#### Hardware
| Hardware | Status |
|----------|------------------------------------------|
| **NVIDIA** | <nobr>🚀 Natively Supported</nobr> |
| **AMD** | <nobr>🚧 WIP</nobr> |
| **TPU** | <nobr>🚧 WIP</nobr> |
#### Feature / Model
| Feature / Model | Status |
|-----------------|-----------------------------------------------------------------------------------|
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
| **LoRA** | <nobr>🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))</nobr>|
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
| **Spec Decode** | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
| **FP8 KV Cache** | <nobr>🟡 Planned</nobr> |
| **Structured Output Alternative Backends** | <nobr>🟡 Planned</nobr> |
| **Embedding Models** | <nobr>🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249))</nobr> |
| **Mamba Models** | <nobr>🟡 Planned</nobr> |
| **Encoder-Decoder Models** | <nobr>🟡 Planned</nobr> |
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
- **🟢 Functional**: Fully operational, with ongoing optimizations.
- **🚧 WIP**: Under active development.
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
- **🔴 Deprecated**: Not planned for v1 unless there is strong demand.
**Note**: vLLM V1's unified scheduler treats both prompt and output tokens the same
way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically
allocate a fixed token budget per request, enabling features like chunked prefills,
prefix caching, and speculative decoding without a strict separation between prefill
and decode phases.
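A purely illustrative sketch of this token-budget idea (the names and numbers are made up; this is not the scheduler's real code):

```python
# Illustrative sketch of a fixed token budget split across requests each step.
# This is NOT the V1 scheduler's actual code; names and numbers are invented.
def schedule_step(num_tokens_needed: dict[str, int], token_budget: int) -> dict[str, int]:
    """Assign each request as many tokens as possible within the step budget."""
    scheduled: dict[str, int] = {}
    for request_id, needed in num_tokens_needed.items():
        if token_budget == 0:
            break
        num_scheduled = min(needed, token_budget)  # a long prefill may be chunked
        scheduled[request_id] = num_scheduled
        token_budget -= num_scheduled
    return scheduled

# Two decoding requests and one prefill-heavy request share a 512-token budget.
print(schedule_step({"req-1": 1, "req-2": 1, "req-0": 1000}, token_budget=512))
# -> {'req-1': 1, 'req-2': 1, 'req-0': 510}
```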
### Semantic Changes and Deprecated Features
#### Logprobs
vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
differences compared to V0:
**Logprobs Calculation**
Logprobs in V1 are now returned immediately once computed from the model's raw output (i.e.
before applying any logits post-processing such as temperature scaling or penalty
adjustments). As a result, the returned logprobs do not reflect the final adjusted
probabilities used during sampling.
Support for logprobs with post-sampling adjustments is in progress and will be added in future updates.
**Prompt Logprobs with Prefix Caching**
Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](https://github.com/vllm-project/vllm/issues/13414).
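A hedged sketch of requesting logprobs and prompt logprobs on V1 with prefix caching disabled (the model name is illustrative):

```python
# Hedged sketch: logprobs and prompt logprobs on V1.
# Prefix caching is disabled because prompt logprobs currently require it off.
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enable_prefix_caching=False)
params = SamplingParams(max_tokens=16, logprobs=5, prompt_logprobs=5)
(output, ) = llm.generate(["The capital of France is"], params)

print(output.prompt_logprobs)      # per prompt token: top-5 logprobs
print(output.outputs[0].logprobs)  # per generated token: top-5 logprobs
```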
#### Deprecated Features
As part of the major architectural rework in vLLM V1, several legacy features have been deprecated.
**Sampling features**
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
- **Per-Request Logits Processors**: In V0, users could pass custom
processing functions to adjust logits on a per-request basis. In vLLM V1, this
feature has been deprecated. Instead, the design is moving toward supporting **global logits
processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360).
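For reference, a hedged sketch of what the deprecated V0-style per-request logits processor looked like (the banned token id and model are illustrative):

```python
# Hedged sketch of a V0-style per-request logits processor (deprecated in V1).
# The callable receives the generated token ids and the raw logits for the
# next token, and returns (possibly modified) logits.
import torch
from vllm import LLM, SamplingParams

BANNED_TOKEN_ID = 1000  # arbitrary token id to suppress, for illustration

def ban_token(token_ids: list[int], logits: torch.Tensor) -> torch.Tensor:
    logits[BANNED_TOKEN_ID] = float("-inf")
    return logits

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(max_tokens=16, logits_processors=[ban_token])
print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)
```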
**KV Cache features**
- **GPU <> CPU KV Cache Swapping**: with the new simplified core architecture, vLLM V1 no longer requires KV cache swapping
to handle request preemptions.
**Structured Output features**
- **Request-level Structured Output Backend**: Deprecated; support for alternative backends
(outlines, guidance) with fallbacks is WIP.
### Feature & Model Support in Progress
Although we have re-implemented and partially optimized many features and models from V0 in vLLM V1, optimization work is still ongoing for some, and others remain unsupported.
#### Features to Be Optimized
These features are already supported in vLLM V1, but their optimization is still
in progress.
- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
inferior to that of V0. The team is actively working on improving its
performance
(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize support for EAGLE and MTP over draft-model-based spec decode.
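A hedged sketch of enabling the currently supported ngram-based spec decode (the argument names follow the current engine args; the model and numbers are illustrative):

```python
# Hedged sketch: ngram-based speculative decoding, the variant supported on V1.
# The model name and speculative settings below are illustrative only.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    speculative_model="[ngram]",      # propose drafts via n-gram prompt lookup
    num_speculative_tokens=5,
    ngram_prompt_lookup_max=4,
)
params = SamplingParams(temperature=0.0, max_tokens=64)
print(llm.generate(["The future of AI is"], params)[0].outputs[0].text)
```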
#### Features to Be Supported
- **FP8 KV Cache**: While vLLM V1 introduces new FP8 kernels for model weight quantization, support for an FP8 key-value cache is not yet available. Users must continue using FP16 (or other supported precisions) for the KV cache.
- **Structured Output Alternative Backends**: Structured output alternative backends (outlines, guidance) support is planned. V1 currently
supports only the `xgrammar:no_fallback` mode, meaning that it will error out if the output schema is unsupported by xgrammar.
Details about the structured outputs can be found
[here](https://docs.vllm.ai/en/latest/features/structured_outputs.html).
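A hedged usage sketch with the xgrammar backend (the schema and model are illustrative; an unsupported schema will raise an error because there is no fallback yet):

```python
# Hedged sketch: structured output on V1 with the xgrammar backend.
# The JSON schema and model name are illustrative only.
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

json_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
}

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          guided_decoding_backend="xgrammar")
params = SamplingParams(
    max_tokens=64,
    guided_decoding=GuidedDecodingParams(json=json_schema),
)
print(llm.generate(["Give me a JSON profile for a person."], params)[0].outputs[0].text)
```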
#### Models to Be Supported
vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol,
and the majority fall into the following categories. V1 support for these models will be added eventually.
**Embedding Models**
Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage.
**Mamba Models**
Models using selective state-space mechanisms (instead of standard transformer attention)
are not yet supported (e.g., `MambaForCausalLM`, `JambaForCausalLM`).
**Encoder-Decoder Models**
vLLM V1 is currently optimized for decoder-only transformers. Models requiring
cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
## FAQ
TODO
View File
@ -67,8 +67,6 @@ getting_started/quickstart
getting_started/examples/examples_index getting_started/examples/examples_index
getting_started/troubleshooting getting_started/troubleshooting
getting_started/faq getting_started/faq
getting_started/v1_user_guide
::: :::
% What does vLLM support? % What does vLLM support?
View File
@ -101,7 +101,7 @@ class MyAttention(nn.Module):
def forward(self, hidden_states, **kwargs): # <- kwargs are required def forward(self, hidden_states, **kwargs): # <- kwargs are required
... ...
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface( attn_output, attn_weights = attention_interface(
self, self,
query_states, query_states,
@ -477,11 +477,6 @@ See [this page](#generative-models) for more information on how to use generativ
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
* ✅︎ * ✅︎
* ✅︎ * ✅︎
- * `Zamba2ForCausalLM`
* Zamba2
* `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
*
*
::: :::
:::{note} :::{note}
@ -884,7 +879,7 @@ See [this page](#generative-models) for more information on how to use generativ
- * `PixtralForConditionalGeneration` - * `PixtralForConditionalGeneration`
* Pixtral * Pixtral
* T + I<sup>+</sup> * T + I<sup>+</sup>
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
* *
* ✅︎ * ✅︎
* ✅︎ * ✅︎
@ -951,7 +946,7 @@ V0 correctly implements the model's attention pattern:
V1 currently uses a simplified attention pattern: V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens - Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}` - Generates reasonable outputs but does not match the original model's attention for text + image inputs
- Will be updated in the future to support the correct behavior - Will be updated in the future to support the correct behavior
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
View File
@ -20,7 +20,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b
## Running vLLM on a single node ## Running vLLM on a single node
vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inference currently requires Ray. vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured `tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the `LLM` class `distributed_executor_backend` argument or `--distributed-executor-backend` API server argument. Set it to `mp` for multiprocessing or `ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
@ -29,7 +29,7 @@ To run multi-GPU inference with the `LLM` class, set the `tensor_parallel_size`
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM("facebook/opt-13b", tensor_parallel_size=4) llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
output = llm.generate("San Francisco is a") output = llm.generate("San Franciso is a")
``` ```
To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
View File
@ -39,16 +39,7 @@ The following metrics are exposed:
The following metrics are deprecated and due to be removed in a future version: The following metrics are deprecated and due to be removed in a future version:
- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and - *(No metrics are currently deprecated)*
`vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
used in V1.
- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
counters in V1.
- `vllm:time_in_queue_requests` because it duplicates
`vllm:request_queue_time_seconds`.
- `vllm:model_forward_time_milliseconds` and
`vllm:model_execute_time_milliseconds` because
prefill/decode/inference time metrics should be used instead.
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
View File
@ -1,6 +1,6 @@
# Reinforcement Learning from Human Feedback # Reinforcement Learning from Human Feedback
Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviors. Reinforcement Learning from Human Feedback (RLHF) is a technique that fine-tunes language models using human-generated preference data to align model outputs with desired behaviours.
vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl). vLLM can be used to generate the completions for RLHF. The best way to do this is with libraries like [TRL](https://github.com/huggingface/trl), [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF) and [verl](https://github.com/volcengine/verl).
View File
@ -7,13 +7,11 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
import os import os
from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -25,31 +23,21 @@ question_per_audio_count = {
2: "What sport and what nursery rhyme are referenced?" 2: "What sport and what nursery rhyme are referenced?"
} }
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs. # lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
# MiniCPM-O # MiniCPM-O
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: def run_minicpmo(question: str, audio_count: int):
model_name = "openbmb/MiniCPM-o-2_6" model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, trust_remote_code=True,
trust_remote_code=True, max_model_len=4096,
max_model_len=4096, max_num_seqs=5,
max_num_seqs=5, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count},
)
stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@ -64,16 +52,11 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True, add_generation_prompt=True,
chat_template=audio_chat_template) chat_template=audio_chat_template)
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=stop_token_ids,
)
# Phi-4-multimodal-instruct # Phi-4-multimodal-instruct
def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: def run_phi4mm(questions: str, audio_count: int):
""" """
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs. show how to process audio inputs.
@ -84,35 +67,36 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
speech_lora_path = os.path.join(model_path, "speech-lora") speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>" prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
engine_args = EngineArgs( llm = LLM(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0,
limit_mm_per_prompt={"audio": audio_count}, limit_mm_per_prompt={"audio": audio_count},
) )
lora_request = LoRARequest("speech", 1, speech_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
return ModelRequestData( stop_token_ids = None
engine_args=engine_args, return llm, prompts, stop_token_ids
prompt=prompts,
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
)
# Qwen2-Audio # Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData: def run_qwen2_audio(question: str, audio_count: int):
model_name = "Qwen/Qwen2-Audio-7B-Instruct" model_name = "Qwen/Qwen2-Audio-7B-Instruct"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=5,
max_num_seqs=5, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count},
)
audio_in_prompt = "".join([ audio_in_prompt = "".join([
f"Audio {idx+1}: " f"Audio {idx+1}: "
@ -123,15 +107,12 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
"<|im_start|>user\n" "<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n" f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = None
return ModelRequestData( return llm, prompt, stop_token_ids
engine_args=engine_args,
prompt=prompt,
)
# Ultravox 0.5-1B # Ultravox 0.5-1B
def run_ultravox(question: str, audio_count: int) -> ModelRequestData: def run_ultravox(question: str, audio_count: int):
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
@ -143,39 +124,29 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=5,
max_num_seqs=5, trust_remote_code=True,
trust_remote_code=True, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count}, stop_token_ids = None
) return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
# Whisper # Whisper
def run_whisper(question: str, audio_count: int) -> ModelRequestData: def run_whisper(question: str, audio_count: int):
assert audio_count == 1, ( assert audio_count == 1, (
"Whisper only support single audio input per prompt") "Whisper only support single audio input per prompt")
model_name = "openai/whisper-large-v3-turbo" model_name = "openai/whisper-large-v3-turbo"
prompt = "<|startoftranscript|>" prompt = "<|startoftranscript|>"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=448,
max_model_len=448, max_num_seqs=5,
max_num_seqs=5, limit_mm_per_prompt={"audio": audio_count})
limit_mm_per_prompt={"audio": audio_count}, stop_token_ids = None
) return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
model_example_map = { model_example_map = {
@ -193,24 +164,14 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
audio_count = args.num_audios audio_count = args.num_audios
req_data = model_example_map[model](question_per_audio_count[audio_count], llm, prompt, stop_token_ids = model_example_map[model](
audio_count) question_per_audio_count[audio_count], audio_count)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference. # even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2,
max_tokens=64, max_tokens=64,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=stop_token_ids)
mm_data = {} mm_data = {}
if audio_count > 0: if audio_count > 0:
@ -222,7 +183,7 @@ def main(args):
} }
assert args.num_prompts > 0 assert args.num_prompts > 0
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} inputs = {"prompt": prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1: if args.num_prompts > 1:
# Batch inference # Batch inference
inputs = [inputs] * args.num_prompts inputs = [inputs] * args.num_prompts
@ -253,10 +214,6 @@ if __name__ == "__main__":
default=1, default=1,
choices=[0, 1, 2], choices=[0, 1, 2],
help="Number of audio items per prompt.") help="Number of audio items per prompt.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -76,10 +76,5 @@ if __name__ == "__main__":
GPUs_per_dp_rank)) GPUs_per_dp_rank))
proc.start() proc.start()
procs.append(proc) procs.append(proc)
exit_code = 0
for proc in procs: for proc in procs:
proc.join() proc.join()
if proc.exitcode:
exit_code = proc.exitcode
exit(exit_code)
View File
@ -1,93 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import os
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset",
type=str,
default="./examples/data/gsm8k.jsonl",
help="downloaded from the eagle repo " \
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
)
parser.add_argument("--max_num_seqs", type=int, default=8)
parser.add_argument("--num_prompts", type=int, default=80)
parser.add_argument("--num_spec_tokens", type=int, default=2)
parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--draft_tp", type=int, default=1)
parser.add_argument("--enforce_eager", action='store_true')
parser.add_argument("--enable_chunked_prefill", action='store_true')
parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
parser.add_argument("--temp", type=float, default=0)
args = parser.parse_args()
print(args)
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
max_model_len = 2048
tokenizer = AutoTokenizer.from_pretrained(model_dir)
if os.path.exists(args.dataset):
prompts = []
num_prompts = args.num_prompts
with open(args.dataset) as f:
for line in f:
data = json.loads(line)
prompts.append(data["turns"][0])
else:
prompts = ["The future of AI is", "The president of the United States is"]
prompts = prompts[:args.num_prompts]
num_prompts = len(prompts)
prompt_ids = [
tokenizer.apply_chat_template([{
"role": "user",
"content": prompt
}],
add_generation_prompt=True)
for prompt in prompts
]
llm = LLM(
model=model_dir,
trust_remote_code=True,
tensor_parallel_size=args.tp,
enable_chunked_prefill=args.enable_chunked_prefill,
max_num_batched_tokens=args.max_num_batched_tokens,
enforce_eager=args.enforce_eager,
max_model_len=max_model_len,
max_num_seqs=args.max_num_seqs,
gpu_memory_utilization=0.8,
speculative_model=eagle_dir,
num_speculative_tokens=args.num_spec_tokens,
speculative_draft_tensor_parallel_size=args.draft_tp,
speculative_max_model_len=max_model_len,
disable_log_stats=False,
)
sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
outputs = llm.generate(prompt_token_ids=prompt_ids,
sampling_params=sampling_params)
# calculate the average number of accepted tokens per forward pass, +1 is
# to account for the token from the target model that's always going to be
# accepted
acceptance_counts = [0] * (args.num_spec_tokens + 1)
for output in outputs:
for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
acceptance_counts[step] += count
print(f"mean acceptance length: \
{sum(acceptance_counts) / acceptance_counts[0]:.2f}")
View File
@ -4,23 +4,16 @@ This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation. the explicit/implicit prompt format on enc-dec LMMs for text generation.
""" """
import time import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: Sequence[PromptType]
def run_florence2(): def run_florence2():
engine_args = EngineArgs( # Create a Florence-2 encoder/decoder model instance
llm = LLM(
model="microsoft/Florence-2-large", model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large", tokenizer="facebook/bart-large",
max_num_seqs=8, max_num_seqs=8,
@ -46,15 +39,12 @@ def run_florence2():
"decoder_prompt": "", "decoder_prompt": "",
}, },
] ]
return llm, prompts
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_mllama(): def run_mllama():
engine_args = EngineArgs( # Create a Mllama encoder/decoder model instance
llm = LLM(
model="meta-llama/Llama-3.2-11B-Vision-Instruct", model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -79,15 +69,12 @@ def run_mllama():
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501 "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
}, },
] ]
return llm, prompts
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_whisper(): def run_whisper():
engine_args = EngineArgs( # Create a Whisper encoder/decoder model instance
llm = LLM(
model="openai/whisper-large-v3-turbo", model="openai/whisper-large-v3-turbo",
max_model_len=448, max_model_len=448,
max_num_seqs=16, max_num_seqs=16,
@ -112,11 +99,7 @@ def run_whisper():
"decoder_prompt": "<|startoftranscript|>", "decoder_prompt": "<|startoftranscript|>",
} }
] ]
return llm, prompts
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
model_example_map = { model_example_map = {
@ -131,12 +114,7 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
req_data = model_example_map[model]() llm, prompts = model_example_map[model]()
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
prompts = req_data.prompts
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams( sampling_params = SamplingParams(
@ -175,10 +153,6 @@ if __name__ == "__main__":
default="mllama", default="mllama",
choices=model_example_map.keys(), choices=model_example_map.keys(),
help='Huggingface "model_type".') help='Huggingface "model_type".')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
View File
@ -6,14 +6,14 @@ import argparse
from vllm import LLM from vllm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
# This script is an offline demo for running Mistral-Small-3 # This script is an offline demo for running Pixtral.
# #
# If you want to run a server/client setup, please follow this code: # If you want to run a server/client setup, please follow this code:
# #
# - Server: # - Server:
# #
# ```bash # ```bash
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384 # vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
# ``` # ```
# #
# - Client: # - Client:
@ -23,7 +23,7 @@ from vllm.sampling_params import SamplingParams
# --header 'Content-Type: application/json' \ # --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \ # --header 'Authorization: Bearer token' \
# --data '{ # --data '{
# "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", # "model": "mistralai/Pixtral-12B-2409",
# "messages": [ # "messages": [
# { # {
# "role": "user", # "role": "user",
@ -43,18 +43,12 @@ from vllm.sampling_params import SamplingParams
# python demo.py advanced # python demo.py advanced
def run_simple_demo(args: argparse.Namespace): def run_simple_demo():
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" model_name = "mistralai/Pixtral-12B-2409"
sampling_params = SamplingParams(max_tokens=8192) sampling_params = SamplingParams(max_tokens=8192)
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. # Lower max_num_seqs or max_model_len on low-VRAM GPUs.
llm = LLM( llm = LLM(model=model_name, tokenizer_mode="mistral")
model=model_name,
tokenizer_mode="mistral",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompt = "Describe this image in one sentence." prompt = "Describe this image in one sentence."
image_url = "https://picsum.photos/id/237/200/300" image_url = "https://picsum.photos/id/237/200/300"
@ -82,8 +76,8 @@ def run_simple_demo(args: argparse.Namespace):
print(outputs[0].outputs[0].text) print(outputs[0].outputs[0].text)
def run_advanced_demo(args: argparse.Namespace): def run_advanced_demo():
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" model_name = "mistralai/Pixtral-12B-2409"
max_img_per_msg = 5 max_img_per_msg = 5
max_tokens_per_img = 4096 max_tokens_per_img = 4096
@ -93,7 +87,6 @@ def run_advanced_demo(args: argparse.Namespace):
tokenizer_mode="mistral", tokenizer_mode="mistral",
limit_mm_per_prompt={"image": max_img_per_msg}, limit_mm_per_prompt={"image": max_img_per_msg},
max_model_len=max_img_per_msg * max_tokens_per_img, max_model_len=max_img_per_msg * max_tokens_per_img,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
prompt = "Describe the following image." prompt = "Describe the following image."
@ -160,19 +153,14 @@ def main():
help="Specify the demo mode: 'simple' or 'advanced'", help="Specify the demo mode: 'simple' or 'advanced'",
) )
parser.add_argument(
'--disable-mm-preprocessor-cache',
action='store_true',
help='If True, disables caching of multi-modal preprocessor/mapper.')
args = parser.parse_args() args = parser.parse_args()
if args.mode == "simple": if args.mode == "simple":
print("Running simple demo...") print("Running simple demo...")
run_simple_demo(args) run_simple_demo()
elif args.mode == "advanced": elif args.mode == "advanced":
print("Running advanced demo...") print("Running advanced demo...")
run_advanced_demo(args) run_advanced_demo()
if __name__ == "__main__": if __name__ == "__main__":
View File
@ -8,164 +8,122 @@ on HuggingFace model repository.
""" """
import os import os
import random import random
from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: list[str]
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs. # lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
# Aria # Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData: def run_aria(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM # NOTE: Need L40 (or equivalent) to avoid OOM
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, dtype="bfloat16",
dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
"<|im_end|>\n<|im_start|>assistant\n") "<|im_end|>\n<|im_start|>assistant\n")
for question in questions] for question in questions]
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# BLIP-2 # BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData: def run_blip2(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository. # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions] prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs( llm = LLM(model="Salesforce/blip2-opt-2.7b",
model="Salesforce/blip2-opt-2.7b", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Chameleon # Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: def run_chameleon(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [f"{question}<image>" for question in questions] prompts = [f"{question}<image>" for question in questions]
engine_args = EngineArgs( llm = LLM(model="facebook/chameleon-7b",
model="facebook/chameleon-7b", max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Deepseek-VL2 # Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: def run_deepseek_vl2(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)
prompts = [ prompts = [
f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
for question in questions for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Florence2 # Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData: def run_florence2(question: str, modality: str):
assert modality == "image" assert modality == "image"
engine_args = EngineArgs( llm = LLM(model="microsoft/Florence-2-large",
model="microsoft/Florence-2-large", tokenizer="facebook/bart-large",
tokenizer="facebook/bart-large", max_num_seqs=8,
max_num_seqs=8, trust_remote_code=True,
trust_remote_code=True, dtype="bfloat16",
dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions] prompt = "<MORE_DETAILED_CAPTION>"
stop_token_ids = None
return ModelRequestData( return llm, prompt, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Fuyu # Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: def run_fuyu(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [f"{question}\n" for question in questions] prompts = [f"{question}\n" for question in questions]
engine_args = EngineArgs( llm = LLM(model="adept/fuyu-8b",
model="adept/fuyu-8b", max_model_len=2048,
max_model_len=2048, max_num_seqs=2,
max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Gemma 3 # Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: def run_gemma3(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "google/gemma-3-4b-it" model_name = "google/gemma-3-4b-it"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
@ -177,27 +135,22 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
prompts = [("<bos><start_of_turn>user\n" prompts = [("<bos><start_of_turn>user\n"
f"<start_of_image>{question}<end_of_turn>\n" f"<start_of_image>{question}<end_of_turn>\n"
"<start_of_turn>model\n") for question in questions] "<start_of_turn>model\n") for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# GLM-4v # GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: def run_glm4v(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "THUDM/glm-4v-9b" model_name = "THUDM/glm-4v-9b"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=2048,
max_model_len=2048, max_num_seqs=2,
max_num_seqs=2, trust_remote_code=True,
trust_remote_code=True, enforce_eager=True,
enforce_eager=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]},
hf_overrides={"architectures": ["GLM4VForCausalLM"]}, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [ prompts = [
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
@ -205,21 +158,16 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
] ]
stop_token_ids = [151329, 151336, 151338] stop_token_ids = [151329, 151336, 151338]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# H2OVL-Mississippi # H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: def run_h2ovl(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -239,20 +187,15 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
# Stop tokens for H2OVL-Mississippi # Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m # https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# Idefics3-8B-Llama3 # Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: def run_idefics3(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -269,20 +212,17 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
prompts = [( prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:" f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
) for question in questions] ) for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# InternVL # InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData: def run_internvl(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -305,75 +245,53 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# LLaVA-1.5 # LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData: def run_llava(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [ prompts = [
f"USER: <image>\n{question}\nASSISTANT:" for question in questions f"USER: <image>\n{question}\nASSISTANT:" for question in questions
] ]
engine_args = EngineArgs( llm = LLM(model="llava-hf/llava-1.5-7b-hf",
model="llava-hf/llava-1.5-7b-hf", max_model_len=4096,
max_model_len=4096, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLaVA-1.6/LLaVA-NeXT # LLaVA-1.6/LLaVA-NeXT
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: def run_llava_next(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions] prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
engine_args = EngineArgs( llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192,
max_model_len=8192, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LlaVA-NeXT-Video # LlaVA-NeXT-Video
# Currently only support for video input # Currently only support for video input
def run_llava_next_video(questions: list[str], def run_llava_next_video(questions: list[str], modality: str):
modality: str) -> ModelRequestData:
assert modality == "video" assert modality == "video"
prompts = [ prompts = [
f"USER: <video>\n{question} ASSISTANT:" for question in questions f"USER: <video>\n{question} ASSISTANT:" for question in questions
] ]
engine_args = EngineArgs( llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192,
max_model_len=8192, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLaVA-OneVision # LLaVA-OneVision
def run_llava_onevision(questions: list[str], def run_llava_onevision(questions: list[str], modality: str):
modality: str) -> ModelRequestData:
if modality == "video": if modality == "video":
prompts = [ prompts = [
@ -387,20 +305,15 @@ def run_llava_onevision(questions: list[str],
<|im_start|>assistant\n" for question in questions <|im_start|>assistant\n" for question in questions
] ]
engine_args = EngineArgs( llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
model="llava-hf/llava-onevision-qwen2-7b-ov-hf", max_model_len=16384,
max_model_len=16384, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, stop_token_ids = None
) return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Mantis # Mantis
def run_mantis(questions: list[str], modality: str) -> ModelRequestData: def run_mantis(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
@ -409,19 +322,14 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
for question in questions for question in questions
] ]
engine_args = EngineArgs( llm = LLM(
model="TIGER-Lab/Mantis-8B-siglip-llama3", model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096, max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
stop_token_ids = [128009] stop_token_ids = [128009]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# MiniCPM-V # MiniCPM-V
@ -449,7 +357,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
# model_name = "openbmb/MiniCPM-o-2_6" # model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -481,24 +389,19 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
tokenize=False, tokenize=False,
add_generation_prompt=True) for question in questions add_generation_prompt=True) for question in questions
] ]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData: def run_minicpmo(questions: list[str], modality: str):
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6") return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData: def run_minicpmv(questions: list[str], modality: str):
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6") return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
# LLama 3.2 # LLama 3.2
def run_mllama(questions: list[str], modality: str) -> ModelRequestData: def run_mllama(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@ -508,7 +411,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
# You may lower either to run this example on lower-end GPUs. # You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=16,
@ -529,20 +432,17 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
prompts = tokenizer.apply_chat_template(messages, prompts = tokenizer.apply_chat_template(messages,
add_generation_prompt=True, add_generation_prompt=True,
tokenize=False) tokenize=False)
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Molmo # Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData: def run_molmo(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "allenai/Molmo-7B-D-0924" model_name = "allenai/Molmo-7B-D-0924"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
@ -553,21 +453,18 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
f"<|im_start|>user <image>\n{question}<|im_end|> \ f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n" for question in questions <|im_start|>assistant\n" for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# NVLM-D # NVLM-D
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: def run_nvlm_d(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -584,47 +481,36 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
prompts = tokenizer.apply_chat_template(messages, prompts = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# PaliGemma # PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: def run_paligemma(question: str, modality: str):
assert modality == "image" assert modality == "image"
# PaliGemma has special prompt format for VQA # PaliGemma has special prompt format for VQA
prompts = ["caption en" for _ in questions] prompt = ["caption en"]
engine_args = EngineArgs( llm = LLM(model="google/paligemma-3b-mix-224",
model="google/paligemma-3b-mix-224", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# PaliGemma 2 # PaliGemma 2
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData: def run_paligemma2(question: str, modality: str):
assert modality == "image" assert modality == "image"
# PaliGemma 2 has special prompt format for VQA # PaliGemma 2 has special prompt format for VQA
prompts = ["caption en" for _ in questions] prompt = ["caption en"]
engine_args = EngineArgs( llm = LLM(model="google/paligemma2-3b-ft-docci-448",
model="google/paligemma2-3b-ft-docci-448", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Phi-3-Vision # Phi-3-Vision
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData: def run_phi3v(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
prompts = [ prompts = [
@ -644,7 +530,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
# #
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
engine_args = EngineArgs( llm = LLM(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -653,15 +539,12 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
mm_processor_kwargs={"num_crops": 16}, mm_processor_kwargs={"num_crops": 16},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Phi-4-multimodal-instruct # Phi-4-multimodal-instruct
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: def run_phi4mm(questions: list[str], modality: str):
""" """
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs. show how to process image inputs.
@ -675,30 +558,33 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
f"<|user|><|image_1|>{question}<|end|><|assistant|>" f"<|user|><|image_1|>{question}<|end|><|assistant|>"
for question in questions for question in questions
] ]
engine_args = EngineArgs( llm = LLM(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0,
) )
lora_request = LoRARequest("vision", 1, vision_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
return ModelRequestData( stop_token_ids = None
engine_args=engine_args, return llm, prompts, stop_token_ids
prompts=prompts,
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)
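The comments on both sides of this hunk point at the same two ways of attaching the Phi-4-multimodal vision LoRA adapter: register it on the engine once, or pass it along with each request. A self-contained sketch of both variants follows; the snapshot_download call and the text prompt are illustrative assumptions, while the engine arguments and the LoRARequest mirror what is shown above.

import os

from huggingface_hub import snapshot_download

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Assumed checkpoint location; the full script resolves it the same way.
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
vision_lora_path = os.path.join(model_path, "vision-lora")

llm = LLM(model=model_path,
          trust_remote_code=True,
          max_model_len=4096,
          max_num_seqs=2,
          enable_lora=True,
          max_lora_rank=320)
lora_request = LoRARequest("vision", 1, vision_lora_path)
prompts = ["<|user|>What does the vision LoRA adapter add?<|end|><|assistant|>"]

# Variant 1: register the adapter on the engine once, then generate as usual.
llm.llm_engine.add_lora(lora_request=lora_request)
outputs = llm.generate(prompts, SamplingParams(max_tokens=64))

# Variant 2: pass the adapter along with the request instead.
outputs = llm.generate(prompts, SamplingParams(max_tokens=64),
                       lora_request=lora_request)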
# Pixtral HF-format # Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: def run_pixtral_hf(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# NOTE: Need L40 (or equivalent) to avoid OOM # NOTE: Need L40 (or equivalent) to avoid OOM
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -706,18 +592,15 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
) )
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions] prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Qwen # Qwen
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen_vl(questions: list[str], modality: str):
assert modality == "image" assert modality == "image"
engine_args = EngineArgs( llm = LLM(
model="Qwen/Qwen-VL", model="Qwen/Qwen-VL",
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
@ -727,19 +610,16 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
) )
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions] prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Qwen2-VL # Qwen2-VL
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen2_vl(questions: list[str], modality: str):
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
@ -762,19 +642,16 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions "<|im_start|>assistant\n") for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
# Qwen2.5-VL # Qwen2.5-VL
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: def run_qwen2_5_vl(questions: list[str], modality: str):
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
@ -797,11 +674,8 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions "<|im_start|>assistant\n") for question in questions
] ]
stop_token_ids = None
return ModelRequestData( return llm, prompts, stop_token_ids
engine_args=engine_args,
prompts=prompts,
)
model_example_map = { model_example_map = {
@ -915,28 +789,18 @@ def main(args):
data = mm_input["data"] data = mm_input["data"]
questions = mm_input["questions"] questions = mm_input["questions"]
req_data = model_example_map[model](questions, modality) llm, prompts, stop_token_ids = model_example_map[model](questions,
modality)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = req_data.prompts if args.use_different_prompt_per_request else [ prompts = prompts if args.use_different_prompt_per_request else [
req_data.prompts[0] prompts[0]
] ]
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference. # even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2,
max_tokens=64, max_tokens=64,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=stop_token_ids)
assert args.num_prompts > 0 assert args.num_prompts > 0
if args.num_prompts == 1: if args.num_prompts == 1:
@ -1001,10 +865,6 @@ if __name__ == "__main__":
type=int, type=int,
default=16, default=16,
help='Number of frames to extract from the video.') help='Number of frames to extract from the video.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument( parser.add_argument(
'--image-repeat-prob', '--image-repeat-prob',
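The two columns of this file differ mainly in where the engine is built: one constructs LLM(...) inside every run_* helper, the other has the helpers return EngineArgs wrapped in a ModelRequestData and lets main() instantiate the engine once, merging in CLI-level options such as --seed. A condensed sketch of the deferred variant, with the ModelRequestData fields inferred from the calls above (a text-only illustration, not the full script, which also attaches image inputs):

from dataclasses import asdict
from typing import NamedTuple, Optional

from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None


def run_llava(questions: list[str]) -> ModelRequestData:
    # Helpers only describe the engine; nothing heavy is loaded here.
    engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf",
                             max_model_len=4096)
    prompts = [f"USER: {q}\nASSISTANT:" for q in questions]
    return ModelRequestData(engine_args=engine_args, prompts=prompts)


def main(questions: list[str], seed: Optional[int] = None) -> None:
    req_data = run_llava(questions)
    # The caller owns engine construction and can layer on CLI overrides.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)
    for output in llm.generate(req_data.prompts, sampling_params):
        print(output.outputs[0].text)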

View File

@ -7,12 +7,11 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from PIL.Image import Image from PIL.Image import Image
from vllm import LLM, EngineArgs from vllm import LLM
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -38,12 +37,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
engine_args: EngineArgs llm: LLM
prompt: str prompt: str
image: Optional[Image] image: Optional[Image]
def run_e5_v(query: Query) -> ModelRequestData: def run_e5_v(query: Query):
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
if query["modality"] == "text": if query["modality"] == "text":
@ -59,20 +58,20 @@ def run_e5_v(query: Query) -> ModelRequestData:
modality = query['modality'] modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( llm = LLM(
model="royokong/e5-v", model="royokong/e5-v",
task="embed", task="embed",
max_model_len=4096, max_model_len=4096,
) )
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
image=image, image=image,
) )
def run_vlm2vec(query: Query) -> ModelRequestData: def run_vlm2vec(query: Query):
if query["modality"] == "text": if query["modality"] == "text":
text = query["text"] text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
@ -88,7 +87,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
modality = query['modality'] modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'") raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs( llm = LLM(
model="TIGER-Lab/VLM2Vec-Full", model="TIGER-Lab/VLM2Vec-Full",
task="embed", task="embed",
trust_remote_code=True, trust_remote_code=True,
@ -96,7 +95,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
) )
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
image=image, image=image,
) )
@ -127,18 +126,15 @@ def get_query(modality: QueryModality):
raise ValueError(msg) raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: Optional[int]): def run_encode(model: str, modality: QueryModality):
query = get_query(modality) query = get_query(modality)
req_data = model_example_map[model](query) req_data = model_example_map[model](query)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
mm_data = {} mm_data = {}
if req_data.image is not None: if req_data.image is not None:
mm_data["image"] = req_data.image mm_data["image"] = req_data.image
outputs = llm.embed({ outputs = req_data.llm.embed({
"prompt": req_data.prompt, "prompt": req_data.prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
@ -148,7 +144,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
def main(args: Namespace): def main(args: Namespace):
run_encode(args.model_name, args.modality, args.seed) run_encode(args.model_name, args.modality)
model_example_map = { model_example_map = {
@ -171,10 +167,5 @@ if __name__ == "__main__":
default="image", default="image",
choices=get_args(QueryModality), choices=get_args(QueryModality),
help='Modality of the input.') help='Modality of the input.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
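In the embedding script, both sides end in the same embed() call: build a prompt in the model's template, attach an image (if any) under multi_modal_data, and read back one pooled vector per prompt. A minimal text-only sketch using the e5-v settings shown above; the exact wording of the text prompt is an assumption in the style the script uses.

from vllm import LLM

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embed", max_model_len=4096)

# Text query; for an image query the prompt keeps an <image> placeholder and
# mm_data["image"] is set to a PIL image (e.g. via fetch_image) instead.
prompt = llama3_template.format(
    "A cat standing in the snow.\nSummary above sentence in one word: ")
mm_data = {}

outputs = llm.embed({"prompt": prompt, "multi_modal_data": mm_data})
# One result per input; each carries the pooled embedding vector.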

View File

@ -6,14 +6,13 @@ using the chat template defined by the model.
""" """
import os import os
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple, Optional from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -26,12 +25,11 @@ IMAGE_URLS = [
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
engine_args: EngineArgs llm: LLM
prompt: str prompt: str
stop_token_ids: Optional[list[int]]
image_data: list[Image] image_data: list[Image]
stop_token_ids: Optional[list[int]] = None chat_template: Optional[str]
chat_template: Optional[str] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@ -39,55 +37,53 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData: def load_aria(question, image_urls: list[str]) -> ModelRequestData:
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, tokenizer_mode="slow",
tokenizer_mode="slow", trust_remote_code=True,
trust_remote_code=True, dtype="bfloat16",
dtype="bfloat16", limit_mm_per_prompt={"image": len(image_urls)})
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls) placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_deepseek_vl2(question: str, def load_deepseek_vl2(question: str, image_urls: list[str]):
image_urls: list[str]) -> ModelRequestData:
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
engine_args = EngineArgs( llm = LLM(model=model_name,
model=model_name, max_model_len=4096,
max_model_len=4096, max_num_seqs=2,
max_num_seqs=2, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, limit_mm_per_prompt={"image": len(image_urls)})
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholder = "".join(f"image_{i}:<image>\n" placeholder = "".join(f"image_{i}:<image>\n"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData: def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
model_name = "google/gemma-3-4b-it" model_name = "google/gemma-3-4b-it"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -116,16 +112,18 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
add_generation_prompt=True) add_generation_prompt=True)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -148,18 +146,19 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=16, max_num_seqs=16,
@ -178,16 +177,18 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -213,18 +214,19 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=16,
@ -234,17 +236,19 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "<|image|>" * len(image_urls) placeholders = "<|image|>" * len(image_urls)
prompt = f"{placeholders}<|begin_of_text|>{question}" prompt = f"{placeholders}<|begin_of_text|>{question}"
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData: def load_nvlm_d(question: str, image_urls: list[str]):
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -262,11 +266,14 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -274,7 +281,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -284,11 +291,14 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "[IMG]" * len(image_urls) placeholders = "[IMG]" * len(image_urls)
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]" prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -305,7 +315,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
# #
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
engine_args = EngineArgs( llm = LLM(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -316,11 +326,14 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "\n".join(f"<|image_{i}|>" placeholders = "\n".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -334,7 +347,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
# Since the vision-lora and speech-lora co-exist with the base model, # Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights. # we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora") vision_lora_path = os.path.join(model_path, "vision-lora")
engine_args = EngineArgs( llm = LLM(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=10000, max_model_len=10000,
@ -342,24 +355,32 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0,
) )
lora_request = LoRARequest("vision", 1, vision_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
placeholders = "".join(f"<|image_{i}|>" placeholders = "".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>" prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
lora_requests=[LoRARequest("vision", 1, vision_lora_path)], chat_template=None,
) )
def load_qwen_vl_chat(question: str, def load_qwen_vl_chat(question: str,
image_urls: list[str]) -> ModelRequestData: image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat" model_name = "Qwen/Qwen-VL-Chat"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
@ -390,7 +411,7 @@ def load_qwen_vl_chat(question: str,
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
@ -398,7 +419,7 @@ def load_qwen_vl_chat(question: str,
) )
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
@ -410,7 +431,7 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40 # Tested on L40
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
@ -439,19 +460,23 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
image_data, _ = process_vision_info(messages) image_data, _ = process_vision_info(messages)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
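Both Qwen2-VL loaders follow the same optional-dependency pattern: use qwen_vl_utils.process_vision_info for image preprocessing when it is installed, and fall back to fetch_image (with a larger max_model_len) when it is not. Stripped of the model specifics, the pattern looks roughly like the sketch below; the URLs and the message content format are placeholders based on the qwen_vl_utils conventions.

from vllm.multimodal.utils import fetch_image

try:
    from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
    # Without the optional package, images are fetched as-is (no smart resize).
    process_vision_info = None

image_urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]
messages = [{
    "role": "user",
    "content": [{"type": "image", "image": url} for url in image_urls] +
               [{"type": "text", "text": "Describe each image."}],
}]

if process_vision_info is None:
    image_data = [fetch_image(url) for url in image_urls]
else:
    image_data, _ = process_vision_info(messages)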
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
@ -462,7 +487,7 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
engine_args = EngineArgs( llm = LLM(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
@ -491,6 +516,8 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
@ -498,9 +525,11 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
return_video_kwargs=False) return_video_kwargs=False)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
@ -522,25 +551,14 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: list[str], def run_generate(model, question: str, image_urls: list[str]):
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = llm.generate( outputs = req_data.llm.generate(
{ {
"prompt": req_data.prompt, "prompt": req_data.prompt,
"multi_modal_data": { "multi_modal_data": {
@ -554,24 +572,13 @@ def run_generate(model, question: str, image_urls: list[str],
print(generated_text) print(generated_text)
def run_chat(model: str, question: str, image_urls: list[str], def run_chat(model: str, question: str, image_urls: list[str]):
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = llm.chat( outputs = req_data.llm.chat(
[{ [{
"role": "role":
"user", "user",
@ -600,12 +607,11 @@ def run_chat(model: str, question: str, image_urls: list[str],
def main(args: Namespace): def main(args: Namespace):
model = args.model_type model = args.model_type
method = args.method method = args.method
seed = args.seed
if method == "generate": if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS, seed) run_generate(model, QUESTION, IMAGE_URLS)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS, seed) run_chat(model, QUESTION, IMAGE_URLS)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")
@ -626,10 +632,6 @@ if __name__ == "__main__":
default="generate", default="generate",
choices=["generate", "chat"], choices=["generate", "chat"],
help="The method to run in `vllm.LLM`.") help="The method to run in `vllm.LLM`.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
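run_generate and run_chat above differ only in how the images reach the model: generate() takes the raw prompt plus multi_modal_data, while chat() takes an OpenAI-style message whose content list mixes a text part with one image_url part per image. A trimmed-down sketch of the chat path, assuming the Phi-3.5-vision loader shown earlier; the image URLs are placeholders.

from vllm import LLM, SamplingParams

IMAGE_URLS = [
    "https://example.com/duck.jpg",
    "https://example.com/lion.jpg",
]

llm = LLM(model="microsoft/Phi-3.5-vision-instruct",
          trust_remote_code=True,
          max_model_len=4096,
          limit_mm_per_prompt={"image": len(IMAGE_URLS)})

sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

outputs = llm.chat(
    [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the content of each image?"},
            *[{"type": "image_url", "image_url": {"url": url}}
              for url in IMAGE_URLS],
        ],
    }],
    sampling_params=sampling_params,
)

for o in outputs:
    print(o.outputs[0].text)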

View File

@ -42,7 +42,7 @@ def post_http_request(prompt: str,
def get_streaming_response(response: requests.Response) -> Iterable[list[str]]: def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False, decode_unicode=False,
delimiter=b"\n"): delimiter=b"\0"):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"] output = data["text"]
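The only change in this client is the byte that separates streamed JSON chunks (b"\0" versus b"\n"). Written with the delimiter as a parameter, the helper looks like the sketch below; pass whichever delimiter the server you are running actually emits.

import json
from collections.abc import Iterable

import requests


def get_streaming_response(response: requests.Response,
                           delimiter: bytes = b"\n") -> Iterable[list[str]]:
    # Each delimited chunk is a standalone JSON object with a "text" field.
    for chunk in response.iter_lines(chunk_size=8192,
                                     decode_unicode=False,
                                     delimiter=delimiter):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            yield data["text"]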

View File

@ -21,7 +21,7 @@ def http_bot(prompt):
for chunk in response.iter_lines(chunk_size=8192, for chunk in response.iter_lines(chunk_size=8192,
decode_unicode=False, decode_unicode=False,
delimiter=b"\n"): delimiter=b"\0"):
if chunk: if chunk:
data = json.loads(chunk.decode("utf-8")) data = json.loads(chunk.decode("utf-8"))
output = data["text"][0] output = data["text"][0]

View File

@ -127,7 +127,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
"vllm": { "vllm": {
"handlers": ["vllm"], "handlers": ["vllm"],
"level": "DEBUG", "level": "DEBUG",
"propagate": false "propagage": false
}, },
"vllm.example_noisy_logger": { "vllm.example_noisy_logger": {
"propagate": false "propagate": false

View File

@ -6,7 +6,7 @@ requires = [
"packaging", "packaging",
"setuptools>=61", "setuptools>=61",
"setuptools-scm>=8.0", "setuptools-scm>=8.0",
"torch == 2.6.0", "torch == 2.5.1",
"wheel", "wheel",
"jinja2", "jinja2",
] ]

View File

@ -4,6 +4,6 @@ ninja
packaging packaging
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
torch==2.6.0 torch==2.5.1
wheel wheel
jinja2>=3.1.6 jinja2

View File

@ -1,4 +1,3 @@
cachetools
psutil psutil
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0 numpy < 2.0.0
@ -20,7 +19,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11 lm-format-enforcer >= 0.10.11, < 0.11
outlines == 0.1.11 outlines == 0.1.11
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64" xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
@ -28,7 +27,7 @@ pyzmq
msgspec msgspec
gguf == 0.10.0 gguf == 0.10.0
importlib_metadata importlib_metadata
mistral_common[opencv] >= 1.5.4 mistral_common[opencv] >= 1.5.0
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
@ -39,4 +38,3 @@ cloudpickle # allows pickling lambda functions in model_executor/models/registry
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md python-json-logger # Used by logging as per examples/other/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu

View File

@ -4,9 +4,9 @@
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1. ray[cgraph] >= 2.43.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.6.0 torch == 2.5.1
torchaudio==2.6.0 torchaudio==2.5.1
# These must be updated alongside torch # These must be updated alongside torch
torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1

View File

@ -9,13 +9,12 @@ msgspec
cloudpickle cloudpickle
# packages to install to build the documentation # packages to install to build the documentation
cachetools
pydantic >= 2.8 pydantic >= 2.8
-f https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/cpu
torch torch
py-cpuinfo py-cpuinfo
transformers transformers
mistral_common >= 1.5.4 mistral_common >= 1.5.0
aiohttp aiohttp
starlette starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

View File

@ -7,9 +7,10 @@ torchvision==0.20.1
torchaudio==2.5.1 torchaudio==2.5.1
cmake>=3.26 cmake>=3.26
ninja
packaging packaging
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
wheel wheel
jinja2>=3.1.6 jinja2
amdsmi==6.2.4 amdsmi==6.2.4

View File

@ -1,23 +0,0 @@
# entrypoints test
# librosa==0.10.2.post1 # required by audio tests in entrypoints/openai
audioread==3.0.1
cffi==1.17.1
decorator==5.2.1
lazy-loader==0.4
platformdirs==4.3.6
pooch==1.8.2
#pycparse==2.22
soundfile==0.13.1
soxr==0.5.0.post1
librosa==0.10.2.post1
# entrypoints test
#vllm[video] # required by entrypoints/openai/test_video.py
decord==0.6.0
# entrypoints test
#sentence-transformers # required by entrypoints/openai/test_score.py
sentence-transformers==3.4.1

View File

@ -8,7 +8,6 @@ pytest-shard
# testing utils # testing utils
awscli awscli
backoff # required for phi4mm test
decord # required for video tests decord # required for video tests
einops # required for MPT, qwen-vl and Mamba einops # required for MPT, qwen-vl and Mamba
httpx httpx
@ -22,17 +21,16 @@ sentence-transformers # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
jiwer # required for audio tests jiwer # required for audio tests
timm # required for internvl test timm # required for internvl test
torch==2.6.0 torch==2.5.1
torchaudio==2.6.0 torchaudio==2.5.1
torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test mistral_common[opencv] >= 1.5.0 # required for pixtral test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.48.2 transformers==4.48.2
# quantization # quantization
bitsandbytes>=0.45.3 bitsandbytes>=0.45.0
buildkite-test-collector==0.1.9 buildkite-test-collector==0.1.9
genai_perf==0.0.8 genai_perf==0.0.8
@ -40,4 +38,4 @@ tritonclient==2.51.0
numpy < 2.0.0 numpy < 2.0.0
runai-model-streamer==0.11.0 runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0 runai-model-streamer-s3==0.11.0

View File

@ -8,7 +8,7 @@ accelerate==1.0.1
# peft # peft
aiohappyeyeballs==2.4.3 aiohappyeyeballs==2.4.3
# via aiohttp # via aiohttp
aiohttp==3.10.11 aiohttp==3.10.10
# via # via
# datasets # datasets
# fsspec # fsspec
@ -33,9 +33,7 @@ audioread==3.0.1
# via librosa # via librosa
awscli==1.35.23 awscli==1.35.23
# via -r requirements/test.in # via -r requirements/test.in
backoff==2.2.1 bitsandbytes==0.45.0
# via -r requirements/test.in
bitsandbytes==0.45.3
# via -r requirements/test.in # via -r requirements/test.in
black==24.10.0 black==24.10.0
# via datamodel-code-generator # via datamodel-code-generator
@ -129,6 +127,7 @@ filelock==3.16.1
# ray # ray
# torch # torch
# transformers # transformers
# triton
fonttools==4.54.1 fonttools==4.54.1
# via matplotlib # via matplotlib
frozendict==2.4.6 frozendict==2.4.6
@ -183,7 +182,7 @@ iniconfig==2.0.0
# via pytest # via pytest
isort==5.13.2 isort==5.13.2
# via datamodel-code-generator # via datamodel-code-generator
jinja2==3.1.6 jinja2==3.1.4
# via # via
# datamodel-code-generator # datamodel-code-generator
# torch # torch
@ -235,7 +234,7 @@ mbstrdecoder==1.1.3
# typepy # typepy
mdurl==0.1.2 mdurl==0.1.2
# via markdown-it-py # via markdown-it-py
mistral-common==1.5.4 mistral-common==1.5.1
# via -r requirements/test.in # via -r requirements/test.in
more-itertools==10.5.0 more-itertools==10.5.0
# via lm-eval # via lm-eval
@ -321,8 +320,6 @@ nvidia-cusparse-cu12==12.3.1.170
# via # via
# nvidia-cusolver-cu12 # nvidia-cusolver-cu12
# torch # torch
nvidia-cusparselt-cu12==0.6.2
# via torch
nvidia-nccl-cu12==2.21.5 nvidia-nccl-cu12==2.21.5
# via torch # via torch
nvidia-nvjitlink-cu12==12.4.127 nvidia-nvjitlink-cu12==12.4.127
@ -594,7 +591,7 @@ timm==1.0.11
# via -r requirements/test.in # via -r requirements/test.in
tokenizers==0.21.0 tokenizers==0.21.0
# via transformers # via transformers
torch==2.6.0 torch==2.5.1
# via # via
# -r requirements/test.in # -r requirements/test.in
# accelerate # accelerate
@ -610,15 +607,13 @@ torch==2.6.0
# torchvision # torchvision
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
torchaudio==2.6.0 torchaudio==2.5.1
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
# vocos # vocos
torchvision==0.21.0 torchvision==0.20.1
# via # via timm
# -r requirements/test.in
# timm
tqdm==4.66.6 tqdm==4.66.6
# via # via
# datasets # datasets
@ -643,7 +638,7 @@ transformers==4.48.2
# transformers-stream-generator # transformers-stream-generator
transformers-stream-generator==0.0.5 transformers-stream-generator==0.0.5
# via -r requirements/test.in # via -r requirements/test.in
triton==3.2.0 triton==3.1.0
# via torch # via torch
tritonclient==2.51.0 tritonclient==2.51.0
# via # via
@ -656,6 +651,7 @@ typepy==1.3.2
# tabledata # tabledata
typing-extensions==4.12.2 typing-extensions==4.12.2
# via # via
# bitsandbytes
# huggingface-hub # huggingface-hub
# librosa # librosa
# mistral-common # mistral-common

View File

@ -3,10 +3,11 @@
# Dependencies for TPU # Dependencies for TPU
cmake>=3.26 cmake>=3.26
ninja
packaging packaging
setuptools-scm>=8 setuptools-scm>=8
wheel wheel
jinja2>=3.1.6 jinja2
ray[default] ray[default]
ray[data] ray[data]
@ -17,9 +18,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

View File

@ -3,11 +3,12 @@
ray>=2.9 ray>=2.9
cmake>=3.26 cmake>=3.26
ninja
packaging packaging
setuptools-scm>=8 setuptools-scm>=8
setuptools>=75.8.0 setuptools>=75.8.0
wheel wheel
jinja2>=3.1.6 jinja2
datasets # for benchmark scripts datasets # for benchmark scripts
torch==2.6.0+xpu torch==2.6.0+xpu
@ -20,4 +21,4 @@ pytorch-triton-xpu
# FIXME: This will be fixed in ipex 2.7. Just leave this here for awareness. # FIXME: This will be fixed in ipex 2.7. Just leave this here for awareness.
# intel-extension-for-pytorch==2.6.10+xpu # intel-extension-for-pytorch==2.6.10+xpu
oneccl_bind_pt==2.6.0+xpu oneccl_bind_pt==2.6.0+xpu
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

View File

@ -294,28 +294,26 @@ class repackage_wheel(build_ext):
]).decode("utf-8") ]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"] upstream_main_commit = json.loads(resp_json)["sha"]
# Check if the upstream_main_commit exists in the local repo # Check if the local main branch is up-to-date. This is to ensure
try: # the base commit we found is the most recent commit on the main
subprocess.check_output( # branch.
["git", "cat-file", "-e", f"{upstream_main_commit}"]) local_main_commit = subprocess.check_output(
except subprocess.CalledProcessError: ["git", "rev-parse", "main"]).decode("utf-8").strip()
# If not present, fetch it from the remote repository. if local_main_commit != upstream_main_commit:
# Note that this does not update any local branches, raise ValueError(
# but ensures that this commit ref and its history are f"Local main branch ({local_main_commit}) is not "
# available in our local repo. "up-to-date with upstream main branch "
subprocess.check_call([ f"({upstream_main_commit}). Please pull the latest "
"git", "fetch", "https://github.com/vllm-project/vllm", "changes from upstream main branch first.")
"main"
])
# Then get the commit hash of the current branch that is the same as # Then get the commit hash of the current branch that is the same as
# the upstream main commit. # the upstream main commit.
current_branch = subprocess.check_output( current_branch = subprocess.check_output(
["git", "branch", "--show-current"]).decode("utf-8").strip() ["git", "branch", "--show-current"]).decode("utf-8").strip()
base_commit = subprocess.check_output([ base_commit = subprocess.check_output(
"git", "merge-base", f"{upstream_main_commit}", current_branch ["git", "merge-base", "main",
]).decode("utf-8").strip() current_branch]).decode("utf-8").strip()
return base_commit return base_commit
except ValueError as err: except ValueError as err:
raise ValueError(err) from None raise ValueError(err) from None

View File

@ -1,11 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import subprocess import subprocess
import sys import sys
import time import time
@ -45,10 +44,7 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
distributed_executor_backend, distributed_executor_backend,
] ]
# API Server Test Requires V0. uvicorn_process = subprocess.Popen(commands)
my_env = os.environ.copy()
my_env["VLLM_USE_V1"] = "0"
uvicorn_process = subprocess.Popen(commands, env=my_env)
yield yield
uvicorn_process.terminate() uvicorn_process.terminate()

View File

@ -151,10 +151,6 @@ def uid() -> str:
@pytest_asyncio.fixture(scope="module") @pytest_asyncio.fixture(scope="module")
async def async_engine(): async def async_engine():
# We cannot use monkeypatch since this is a module
# scoped fixture and monkeypatch is function scoped.
previous_value = os.getenv("VLLM_USE_V1", None)
os.environ["VLLM_USE_V1"] = "0"
engine = await asyncio.get_event_loop().run_in_executor(executor=None, engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine) func=start_engine)
try: try:
@ -165,11 +161,6 @@ async def async_engine():
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
if previous_value:
os.environ["VLLM_USE_V1"] = previous_value
else:
del os.environ["VLLM_USE_V1"]
@pytest.fixture() @pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool: def should_do_global_cleanup_after_test(request) -> bool:

View File

@ -47,7 +47,6 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
@ -64,33 +63,31 @@ def test_models(
pytest.skip( pytest.skip(
f"{backend} does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
with monkeypatch.context() as m: os.environ["VLLM_ATTENTION_BACKEND"] = backend
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2 # 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096 # gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window # we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join( prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:" str(i) for i in range(1024)) + " are:"
example_prompts = [prompt] example_prompts = [prompt]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model, with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@ -107,7 +104,6 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
]) ])
def test_models_distributed( def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
@ -120,41 +116,34 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE: if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}") pytest.skip(f"Skip test for {test_suite}")
with monkeypatch.context() as monkeypatch_context: if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa # test Ray Compiled Graph
# test Ray Compiled Graph os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
if attention_backend: if attention_backend:
monkeypatch_context.setenv( os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
"VLLM_ATTENTION_BACKEND",
attention_backend,
)
dtype = "half" dtype = "half"
max_tokens = 5 max_tokens = 5
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method # will hurt multiprocessing backend with fork method (the default method).
# (the default method). with vllm_runner(model,
with vllm_runner( dtype=dtype,
model, tensor_parallel_size=2,
dtype=dtype, distributed_executor_backend=distributed_executor_backend
tensor_parallel_size=2, ) as vllm_model:
distributed_executor_backend=distributed_executor_backend, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )

View File

@ -7,39 +7,22 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`. Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@ -50,8 +33,8 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models( def test_models(
hf_runner: HfRunner, hf_runner,
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
@ -60,39 +43,37 @@ def test_models(
enforce_eager: bool, enforce_eager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
attention_backend: str, attention_backend: str,
monkeypatch: pytest.MonkeyPatch, monkeypatch,
) -> None: ) -> None:
""" """
Checks exact match decode between huggingface model and vllm runner with Checks exact match decode between huggingface model and vllm runner with
chunked prefill. chunked prefill.
""" """
with monkeypatch.context() as m: override_backend_env_variable(monkeypatch, attention_backend)
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True, enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@ -100,61 +81,57 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner: HfRunner, hf_runner,
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
distributed_executor_backend: str, distributed_executor_backend: str,
attention_backend: str, attention_backend: str,
monkeypatch: pytest.MonkeyPatch, monkeypatch,
) -> None: ) -> None:
with monkeypatch.context() as m: override_backend_env_variable(monkeypatch, attention_backend)
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
dtype = "half" if (model == "meta-llama/Llama-3.2-1B-Instruct"
max_tokens = 5 and distributed_executor_backend == "ray"):
chunked_prefill_token_size = 16 # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
# Add a chunked prefill config. dtype = "half"
max_num_seqs = min(chunked_prefill_token_size, 256) max_tokens = 5
assert chunked_prefill_token_size != -1 chunked_prefill_token_size = 16
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# NOTE: take care of the order. run vLLM first, and then run HF. # Add a chunked prefill config.
# vLLM needs a fresh new process without cuda initialization. max_num_seqs = min(chunked_prefill_token_size, 256)
# if we run HF first, the cuda initialization will be done and it assert chunked_prefill_token_size != -1
# will hurt multiprocessing backend with enable_chunked_prefill = True
# fork method (the default method). max_num_batched_tokens = chunked_prefill_token_size
with vllm_runner( # NOTE: take care of the order. run vLLM first, and then run HF.
model, # vLLM needs a fresh new process without cuda initialization.
dtype=dtype, # if we run HF first, the cuda initialization will be done and it
tensor_parallel_size=2, # will hurt multiprocessing backend with fork method (the default method).
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with hf_runner(model, dtype=dtype) as hf_model: with vllm_runner(
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( with hf_runner(model, dtype=dtype) as hf_model:
outputs_0_lst=hf_outputs, hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
outputs_1_lst=vllm_outputs,
name_0="hf", check_outputs_equal(
name_1="vllm", outputs_0_lst=hf_outputs,
) outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -172,7 +149,7 @@ def test_models_distributed(
# the async postprocessor # the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True]) @pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache( def test_models_with_fp8_kv_cache(
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
kv_cache_dtype: str, kv_cache_dtype: str,
model: str, model: str,
@ -232,7 +209,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching( def test_with_prefix_caching(
vllm_runner: VllmRunner, vllm_runner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
@ -268,10 +245,8 @@ def test_with_prefix_caching(
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy( outputs[enable] += vllm_model.generate_greedy([prompt],
[prompt], max_tokens)
max_tokens,
)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs[False], outputs_0_lst=outputs[False],
@ -282,7 +257,7 @@ def test_with_prefix_caching(
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
@ -290,8 +265,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu( def test_models_cpu(
hf_runner: HfRunner, hf_runner,
vllm_runner: VllmRunner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
@ -299,7 +274,7 @@ def test_models_cpu(
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
enforce_eager: bool, enforce_eager: bool,
attention_backend: str, attention_backend: str,
monkeypatch: pytest.MonkeyPatch, monkeypatch,
) -> None: ) -> None:
test_models( test_models(
hf_runner, hf_runner,
@ -319,11 +294,11 @@ def test_models_cpu(
@pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32]) @pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu( def test_with_prefix_caching_cpu(
vllm_runner: VllmRunner, vllm_runner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,

View File

@ -1,15 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
from ..utils import compare_two_settings from ..utils import compare_two_settings
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
monkeypatch.setenv('VLLM_USE_V1', '0')
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [], compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
["--cpu-offload-gb", "1"]) ["--cpu-offload-gb", "1"])

View File

@ -7,10 +7,10 @@ from vllm import LLM, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
from ..utils import create_new_process_for_each_test from ..utils import fork_new_process_for_each_test
@create_new_process_for_each_test() @fork_new_process_for_each_test
def test_python_error(): def test_python_error():
""" """
Test if Python error occurs when there's low-level Test if Python error occurs when there's low-level
@ -36,7 +36,7 @@ def test_python_error():
allocator.wake_up() allocator.wake_up()
@create_new_process_for_each_test() @fork_new_process_for_each_test
def test_basic_cumem(): def test_basic_cumem():
# some tensors from default memory pool # some tensors from default memory pool
shape = (1024, 1024) shape = (1024, 1024)
@ -69,7 +69,7 @@ def test_basic_cumem():
assert torch.allclose(output, torch.ones_like(output) * 3) assert torch.allclose(output, torch.ones_like(output) * 3)
@create_new_process_for_each_test() @fork_new_process_for_each_test
def test_cumem_with_cudagraph(): def test_cumem_with_cudagraph():
allocator = CuMemAllocator.get_instance() allocator = CuMemAllocator.get_instance()
with allocator.use_memory_pool(): with allocator.use_memory_pool():
@ -114,7 +114,7 @@ def test_cumem_with_cudagraph():
assert torch.allclose(y, x + 1) assert torch.allclose(y, x + 1)
@create_new_process_for_each_test() @fork_new_process_for_each_test
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model, use_v1", "model, use_v1",
[ [
@ -123,38 +123,40 @@ def test_cumem_with_cudagraph():
# sleep mode with pytorch checkpoint # sleep mode with pytorch checkpoint
("facebook/opt-125m", False), ("facebook/opt-125m", False),
]) ])
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): def test_end_to_end(model: str, use_v1: bool):
with monkeypatch.context() as m: import os
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
free, total = torch.cuda.mem_get_info() free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True) llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?" prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10) sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params) output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only # which is difficult to measure in the test. therefore, we only
# test sleep level 1 here. # test sleep level 1 here.
llm.sleep(level=1) llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool, # now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights) # and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be released from PyTorch due to a known bug, # is captured but cannot be released from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected. # therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1. # in V1.
if use_v1: if use_v1:
assert used_bytes < 7 * GiB_bytes assert used_bytes < 7 * GiB_bytes
else: else:
assert used_bytes < 2 * GiB_bytes assert used_bytes < 2 * GiB_bytes
llm.wake_up() llm.wake_up()
output2 = llm.generate(prompt, sampling_params) output2 = llm.generate(prompt, sampling_params)
# cmp output # cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]

View File

@ -21,15 +21,6 @@ MODELS = [
] ]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT is not yet supported there,
so use VLLM_USE_V1=0 for all tests in the file.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
def check_settings(): def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, ( assert ENABLE_ARTIFICIAL_PREEMPT is True, (

View File

@ -6,7 +6,6 @@ from typing import Callable, Union
from torch import fx from torch import fx
from vllm.compilation.inductor_pass import InductorPass from vllm.compilation.inductor_pass import InductorPass
from vllm.config import get_current_vllm_config
class TestBackend: class TestBackend:
@ -18,14 +17,13 @@ class TestBackend:
Inductor config can be modified directly by editing the inductor_config Inductor config can be modified directly by editing the inductor_config
property. This can be helpful for adding passes like the property. This can be helpful for adding passes like the
'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'. 'pre_grad_custom_pass' and the 'post_grad_custom_pre_pass'.
Inductor config is default-initialized from VllmConfig.CompilationConfig.
""" """
def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph],
None]]): None]]):
self.custom_passes = list(passes) self.custom_passes = list(passes)
compile_config = get_current_vllm_config().compilation_config from torch._inductor import config
self.inductor_config = compile_config.inductor_compile_config self.inductor_config = config.shallow_copy_dict()
self.inductor_config['force_disable_caches'] = True self.inductor_config['force_disable_caches'] = True
self.inductor_config['post_grad_custom_post_pass'] = self.post_pass self.inductor_config['post_grad_custom_post_pass'] = self.post_pass

View File

@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
# TEST V1: this should be removed. Right now V1 overrides
# all the torch compile logic. We should re-enable this
# as we add torch compile support back to V1.
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses import dataclasses
from typing import Optional
import pytest import pytest
@ -22,76 +22,75 @@ class TestSetting:
fullgraph: bool fullgraph: bool
# representative settings for testing
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
]
# we cannot afford testing the full Cartesian product # we cannot afford testing the full Cartesian product
# of all models and all levels # of all models and all levels
@pytest.mark.parametrize( @pytest.mark.parametrize("test_setting", test_settings)
"test_setting", def test_compile_correctness(test_setting: TestSetting):
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suits, with different GPUs. # this test is run under multiple suits, with different GPUs.
# make sure we only run the test with correct CUDA devices. # make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests. # don't use "<", as it will duplicate the tests.
@ -104,45 +103,41 @@ def test_compile_correctness(
fullgraph = test_setting.fullgraph fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size: if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.") pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
with monkeypatch.context() as m: all_args: list[list[str]] = []
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) all_envs: list[Optional[dict[str, str]]] = []
final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
all_args: list[list[str]] = [] for level in [
all_envs: list[dict[str, str] | None] = [] CompilationLevel.NO_COMPILATION,
CompilationLevel.PIECEWISE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
for level in [ # inductor will change the output, so we only compare if the output
CompilationLevel.NO_COMPILATION, # is close, not exactly the same.
CompilationLevel.PIECEWISE, compare_all_settings(
]: model,
all_args.append(final_args + [f"-O{level}"]) all_args,
all_envs.append({}) all_envs,
method=method if method != "generate" else "generate_close")
all_envs.clear()
all_args.clear()
# inductor will change the output, so we only compare if the output for level in [
# is close, not exactly the same. CompilationLevel.NO_COMPILATION,
compare_all_settings( CompilationLevel.DYNAMO_AS_IS,
model, CompilationLevel.DYNAMO_ONCE,
all_args, ]:
all_envs, all_args.append(final_args + [f"-O{level}"])
method=method if method != "generate" else "generate_close") all_envs.append({})
all_envs.clear() if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
all_args.clear() # "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
for level in [ compare_all_settings(model, all_args * 3, all_envs, method=method)
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method)

View File

@ -1,115 +1,22 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from typing import Any
import pytest import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support
@pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
return TEST_MODELS
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
) @fork_new_process_for_each_test
@pytest.mark.parametrize("model_info", "", indirect=True) def test_full_graph(model_info, optimization_level):
@create_new_process_for_each_test() model = model_info[0]
def test_full_graph( model_kwargs = model_info[1]
monkeypatch: pytest.MonkeyPatch, check_full_graph_support(model,
model_info: tuple[str, dict[str, Any]], model_kwargs,
optimization_level: int, optimization_level,
): tp_size=1)
model, model_kwargs = model_info
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

tests/compile/utils.py Normal file
View File

@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
TEST_MODELS = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tp_size,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -14,8 +14,8 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL import Image from PIL import Image
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchEncoding, BatchFeature) BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from tests.models.utils import (TokensTextLogprobs, from tests.models.utils import (TokensTextLogprobs,
@ -23,7 +23,7 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment, init_distributed_environment,
@ -34,7 +34,8 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import cuda_device_count_stateless, is_list_of from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_list_of)
logger = init_logger(__name__) logger = init_logger(__name__)
@ -110,26 +111,6 @@ VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`.""" """Singleton instance of :class:`_VideoAssets`."""
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
"""
The V1 oracle sets "VLLM_USE_V1" during loading. This means
that each invocation of a test changes the env variable.
If we touch "VLLM_USE_V1" with monkeypatch, then any changes
made during the test run by vLLM will be cleaned up.
This fixture is used by every test.
"""
# If VLLM_USE_V1 is not set, set then delete. This will
# cause monkeypatch to clean up VLLM_USE_V1 upon exit
# if VLLM modifies the value of envs.VLLM_USE_V1.
if "VLLM_USE_V1" not in os.environ:
monkeypatch.setenv("VLLM_USE_V1", "")
monkeypatch.delenv("VLLM_USE_V1")
@pytest.fixture(params=[True, False]) @pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch): def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without # Automatically runs tests twice, once with V1 and once without
@ -270,18 +251,14 @@ _R = TypeVar("_R")
class HfRunner: class HfRunner:
def get_default_device(self):
from vllm.platforms import current_platform
return ("cpu" if current_platform.is_cpu()
or current_platform.is_openvino() else "cuda")
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
from vllm.platforms import current_platform
if x is None or isinstance(x, (bool, )): if x is None or isinstance(x, (bool, )):
return x return x
if device is None: if device is None:
device = self.device device = "cpu" if current_platform.is_cpu(
) or current_platform.is_openvino() else "cuda"
if isinstance(x, dict): if isinstance(x, dict):
return {k: self.wrap_device(v, device) for k, v in x.items()} return {k: self.wrap_device(v, device) for k, v in x.items()}
@ -294,59 +271,45 @@ class HfRunner:
def __init__( def __init__(
self, self,
model_name: str, model_name: str,
dtype: str = "auto", dtype: str = "half",
*, *,
model_kwargs: Optional[dict[str, Any]] = None, model_kwargs: Optional[dict[str, Any]] = None,
is_sentence_transformer: bool = False, is_sentence_transformer: bool = False,
is_cross_encoder: bool = False, is_cross_encoder: bool = False,
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[..., BatchEncoding] = identity,
) -> None: ) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
self.model_name = model_name self.model_name = model_name
self.config = AutoConfig.from_pretrained(
model_name,
trust_remote_code=True,
)
self.device = self.get_default_device()
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
model_kwargs = model_kwargs if model_kwargs is not None else {}
model_kwargs.setdefault("torch_dtype", torch_dtype)
if is_sentence_transformer: if is_sentence_transformer:
# Lazy init required for AMD CI # Lazy init required for AMD CI
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
self.model = self.wrap_device(
self.model = SentenceTransformer( SentenceTransformer(
model_name, model_name,
device=self.device, device="cpu",
model_kwargs=model_kwargs, trust_remote_code=True,
trust_remote_code=True, ).to(dtype=torch_dtype))
)
elif is_cross_encoder: elif is_cross_encoder:
# Lazy init required for AMD CI # Lazy init required for AMD CI
from sentence_transformers import CrossEncoder from sentence_transformers import CrossEncoder
self.model = CrossEncoder(model_name,
self.model = CrossEncoder( device="cpu",
model_name, trust_remote_code=True)
device=self.device, self.model.model = self.wrap_device(self.model.model)\
automodel_args=model_kwargs, .to(dtype=torch_dtype)
trust_remote_code=True,
)
else: else:
model = auto_cls.from_pretrained( model_kwargs = model_kwargs if model_kwargs is not None else {}
model_name, self.model = self.wrap_device(
trust_remote_code=True, auto_cls.from_pretrained(
**model_kwargs, model_name,
) torch_dtype=torch_dtype,
trust_remote_code=True,
if (getattr(model, "quantization_method", None) != "bitsandbytes" **model_kwargs,
and len({p.device ))
for p in model.parameters()}) < 2):
model = model.to(self.device)
self.model = model
if not skip_tokenizer_init: if not skip_tokenizer_init:
self.tokenizer = AutoTokenizer.from_pretrained( self.tokenizer = AutoTokenizer.from_pretrained(
@ -366,13 +329,16 @@ class HfRunner:
if skip_tokenizer_init: if skip_tokenizer_init:
self.tokenizer = self.processor.tokenizer self.tokenizer = self.processor.tokenizer
self.dtype = dtype
self.postprocess_inputs = postprocess_inputs
def get_inputs( def get_inputs(
self, self,
prompts: list[str], prompts: list[str],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
) -> list[Union[BatchFeature, BatchEncoding]]: ) -> list[BatchEncoding]:
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
@ -382,7 +348,7 @@ class HfRunner:
if audios is not None: if audios is not None:
assert len(prompts) == len(audios) assert len(prompts) == len(audios)
all_inputs: list[Union[BatchFeature, BatchEncoding]] = [] all_inputs: list[BatchEncoding] = []
for i, prompt in enumerate(prompts): for i, prompt in enumerate(prompts):
processor_kwargs: dict[str, Any] = { processor_kwargs: dict[str, Any] = {
"text": prompt, "text": prompt,
@ -398,8 +364,7 @@ class HfRunner:
processor_kwargs["sampling_rate"] = sr processor_kwargs["sampling_rate"] = sr
inputs = self.processor(**processor_kwargs) inputs = self.processor(**processor_kwargs)
if isinstance(inputs, BatchFeature): inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
inputs = inputs.to(dtype=self.dtype)
all_inputs.append(inputs) all_inputs.append(inputs)
@ -432,7 +397,7 @@ class HfRunner:
outputs: list[tuple[list[list[int]], list[str]]] = [] outputs: list[tuple[list[list[int]], list[str]]] = []
for inputs in all_inputs: for inputs in all_inputs:
output_ids = self.model.generate( output_ids = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs, device=self.model.device.type),
use_cache=True, use_cache=True,
**kwargs, **kwargs,
) )
@ -503,7 +468,7 @@ class HfRunner:
all_logprobs: list[list[torch.Tensor]] = [] all_logprobs: list[list[torch.Tensor]] = []
for inputs in all_inputs: for inputs in all_inputs:
output = self.model.generate( output = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs, device=self.model.device.type),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
@ -584,7 +549,7 @@ class HfRunner:
for inputs in all_inputs: for inputs in all_inputs:
output = self.model.generate( output = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs, device=self.model.device.type),
use_cache=True, use_cache=True,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
@ -635,15 +600,19 @@ class HfRunner:
if images is not None and images[i] is not None: if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i] processor_kwargs["images"] = images[i]
encoder_inputs = self.processor(**processor_kwargs) encoder_inputs = self.wrap_device(
encoder_inputs = self.wrap_device(encoder_inputs) self.processor(**processor_kwargs),
device=self.model.device.type,
)
if decoder_prompt is None: if decoder_prompt is None:
decoder_input_ids = None decoder_input_ids = None
else: else:
decoder_inputs = self.tokenizer(decoder_prompt, decoder_input_ids = self.wrap_device(
return_tensors="pt") self.tokenizer(decoder_prompt,
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids) return_tensors="pt").input_ids,
device=self.model.device.type,
)
output = self.model.generate( output = self.model.generate(
decoder_input_ids=decoder_input_ids, decoder_input_ids=decoder_input_ids,
@ -692,18 +661,6 @@ def hf_runner():
class VllmRunner: class VllmRunner:
"""
The default values of some arguments have been modified from
:class:`~vllm.LLM` as follows:
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
- `seed`: Set to `0` instead of `None` for test reproducibility.
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
- `block_size`: Set to `16` instead of `None` to reduce memory usage.
- `enable_chunked_prefill`: Set to `False` instead of `None` for
test reproducibility.
- `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
"""
def __init__( def __init__(
self, self,
@ -711,14 +668,14 @@ class VllmRunner:
task: TaskOption = "auto", task: TaskOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
trust_remote_code: bool = True, # Use smaller max model length, otherwise bigger model cannot run due
seed: Optional[int] = 0, # to kv cache size limit.
max_model_len: int = 1024, max_model_len: int = 1024,
dtype: str = "auto", dtype: str = "half",
disable_log_stats: bool = True, disable_log_stats: bool = True,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16, block_size: int = 16,
enable_chunked_prefill: Optional[bool] = False, enable_chunked_prefill: bool = False,
swap_space: int = 4, swap_space: int = 4,
enforce_eager: Optional[bool] = False, enforce_eager: Optional[bool] = False,
**kwargs, **kwargs,
@ -728,9 +685,8 @@ class VllmRunner:
task=task, task=task,
tokenizer=tokenizer_name, tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code, trust_remote_code=True,
dtype=dtype, dtype=dtype,
seed=seed,
swap_space=swap_space, swap_space=swap_space,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats, disable_log_stats=disable_log_stats,

View File

@ -1,11 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -1,10 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass

View File

@ -1,141 +0,0 @@
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional

import pytest

from vllm import LLM, SamplingParams, envs

MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200


def _test_stopping(llm: LLM,
                   expected_output: str,
                   expected_reason: Any,
                   stop: Optional[list[str]] = None,
                   stop_token_ids: Optional[list[int]] = None,
                   include_in_output: bool = False) -> None:
    output = llm.generate(
        "A story about vLLM:\n",
        SamplingParams(
            temperature=0.0,
            max_tokens=MAX_TOKENS,
            stop=stop,
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_in_output,
        ))[0].outputs[0]

    assert output is not None
    assert output.text == expected_output
    assert output.stop_reason == expected_reason


def _set_async_mode(llm, is_async):
    llm.llm_engine.scheduler[0].use_async_output_proc = is_async


def _stop_basic(llm):
    _test_stopping(llm,
                   stop=["."],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=".")

    _test_stopping(llm,
                   stop=["."],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization.",
                   expected_reason=".")


def _stop_multi_tokens(llm):
    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
        expected_reason="group of peo")

    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output=
        "VLLM is a 100% volunteer organization. We are a group of peo",
        expected_reason="group of peo")


def _stop_partial_token(llm):
    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer or",
                   expected_reason="gani")

    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organi",
                   expected_reason="gani")


def _stop_token_id(llm):
    # token id 13013 => " organization"
    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer",
                   expected_reason=13013)

    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=13013)


@pytest.mark.skip_global_cleanup
def test_stop_strings():
    # If V0, must set enforce_eager=False since we use
    # async output processing below.
    vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

    if envs.VLLM_USE_V1:
        _stop_basic(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_basic(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_basic(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_multi_tokens(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_multi_tokens(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_multi_tokens(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_partial_token(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_partial_token(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_partial_token(vllm_model)

    if envs.VLLM_USE_V1:
        # FIXME: this does not respect include_in_output=False
        # _stop_token_id(vllm_model)
        pass
    else:
        _set_async_mode(vllm_model, True)
        _stop_token_id(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_token_id(vllm_model)
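
For orientation, the removed test above exercises vLLM's offline stop-string API (SamplingParams.stop, stop_token_ids, include_stop_str_in_output). A minimal, hedged usage sketch follows; the model name and prompt are placeholders, not taken from the diff:

from vllm import LLM, SamplingParams

# Placeholder model; any text-generation model exposes the same interface.
llm = LLM(model="facebook/opt-125m")

params = SamplingParams(
    temperature=0.0,
    max_tokens=64,
    stop=["."],                       # stop at the first period
    include_stop_str_in_output=True,  # keep the matched stop string in .text
)

out = llm.generate("A story about vLLM:\n", params)[0].outputs[0]
# stop_reason holds the matched stop string (or stop token id); it is None
# when generation ends for another reason such as max_tokens.
print(out.text, out.stop_reason)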

View File

@ -3,10 +3,7 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""
- from __future__ import annotations
- from typing import Any, Callable
+ import os
import pytest
import ray
@ -20,18 +17,12 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1)
- def all_reduce_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
+ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -48,17 +39,12 @@ def all_reduce_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def all_gather_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
+ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -81,17 +67,12 @@ def all_gather_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def broadcast_tensor_dict_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
+ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -125,14 +106,9 @@ def broadcast_tensor_dict_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def send_recv_tensor_dict_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -170,14 +146,9 @@ def send_recv_tensor_dict_test_worker(
@ray.remote(num_gpus=1, max_calls=1)
- def send_recv_test_worker(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- pp_size: int,
- rank: int,
- distributed_init_port: str,
- ):
- monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
+ distributed_init_port: str):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@ -203,12 +174,8 @@ def send_recv_test_worker(
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
- def test_multi_process_tensor_parallel(
- monkeypatch: pytest.MonkeyPatch,
- tp_size: int,
- test_target: Callable[..., Any],
- ):
- multi_process_parallel(monkeypatch, tp_size, 1, test_target)
+ def test_multi_process_tensor_parallel(tp_size, test_target):
+ multi_process_parallel(tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
@ -216,12 +183,8 @@ def test_multi_process_tensor_parallel(
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
- def test_multi_process_pipeline_parallel(
- monkeypatch: pytest.MonkeyPatch,
- pp_size: int,
- test_target: Callable[..., Any],
- ):
- multi_process_parallel(monkeypatch, 1, pp_size, test_target)
+ def test_multi_process_pipeline_parallel(pp_size, test_target):
+ multi_process_parallel(1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
@ -234,9 +197,5 @@ def test_multi_process_pipeline_parallel(
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel_pipeline_parallel(
- tp_size: int,
- pp_size: int,
- test_target: Callable[..., Any],
- monkeypatch: pytest.MonkeyPatch,
- ):
- multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
+ tp_size, pp_size, test_target):
+ multi_process_parallel(tp_size, pp_size, test_target)
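
As background for the change above: the left-hand variant scopes the CUDA_VISIBLE_DEVICES cleanup through pytest's monkeypatch fixture instead of popping os.environ directly, so the variable is restored automatically when the test ends. A small, self-contained sketch of that pattern (the test function name here is illustrative, not from the diff):

import os
import pytest

def test_env_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Remove the variable only for the duration of this test;
    # raising=False makes the call a no-op when it is not set.
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    assert "CUDA_VISIBLE_DEVICES" not in os.environ

With a plain os.environ.pop(...), by contrast, the caller is responsible for restoring any previous value once the worker or test finishes.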

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
+ import os
import random
import pytest
@ -22,115 +23,95 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1)
- def graph_allreduce(
- monkeypatch: pytest.MonkeyPatch,
- tp_size,
- pp_size,
- rank,
- distributed_init_port,
- ):
- with monkeypatch.context() as m:
- m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
- device = torch.device(f"cuda:{rank}")
- torch.cuda.set_device(device)
- init_test_distributed_environment(tp_size, pp_size, rank,
- distributed_init_port)
- ensure_model_parallel_initialized(tp_size, pp_size)
- group = get_tensor_model_parallel_group().device_group
+ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+ init_test_distributed_environment(tp_size, pp_size, rank,
+ distributed_init_port)
+ ensure_model_parallel_initialized(tp_size, pp_size)
+ group = get_tensor_model_parallel_group().device_group
# A small all_reduce for warmup.
# this is needed because device communicators might be created lazily
# (e.g. NCCL). This will ensure that the communicator is initialized
# before any communication happens, so that this group can be used for
# graph capture immediately.
data = torch.zeros(1)
data = data.to(device=device)
torch.distributed.all_reduce(data, group=group)
torch.cuda.synchronize()
del data
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture(device=device) as graph_capture_context:
# use integers so result matches NCCL exactly
inp1 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
inp2 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph,
stream=graph_capture_context.stream):
for i in range(num_communication):
out1 = tensor_model_parallel_all_reduce(inp1)
# the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1, group=group)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2, group=group)
graph.replay()
torch.testing.assert_close(out1, inp1)
torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1)
- def eager_allreduce(
- monkeypatch: pytest.MonkeyPatch,
- tp_size,
- pp_size,
- rank,
- distributed_init_port,
- ):
- with monkeypatch.context() as m:
- m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
- device = torch.device(f"cuda:{rank}")
- torch.cuda.set_device(device)
- init_test_distributed_environment(tp_size, pp_size, rank,
- distributed_init_port)
+ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+ init_test_distributed_environment(tp_size, pp_size, rank,
+ distributed_init_port)
# we use the first group to communicate once
# and the second group to communicate twice
# and so on
# this is used to demonstrate that each group can
# communicate independently
num_communication = rank // tp_size + 1
sz = 1024
fa = get_tp_group().ca_comm
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = inp
for _ in range(num_communication):
out = fa.all_reduce(out, registered=False)
torch.testing.assert_close(out, inp * (tp_size**num_communication))
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
- def test_custom_allreduce(
- monkeypatch: pytest.MonkeyPatch,
- tp_size,
- pipeline_parallel_size,
- test_target,
- ):
+ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
- multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
- test_target)
+ multi_process_parallel(tp_size, pipeline_parallel_size, test_target)

View File

@ -8,7 +8,7 @@ import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger
- from ..utils import compare_two_settings, create_new_process_for_each_test
+ from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_expert_parallel")
@ -209,7 +209,7 @@ def _compare_tp(
for params in settings.iter_params(model_name)
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_ep(
model_name: str,
parallel_setup: ParallelSetup,

View File

@ -17,25 +17,13 @@ from vllm.config import TaskOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
- from ..utils import compare_two_settings, create_new_process_for_each_test
+ from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
- @pytest.fixture(scope="function", autouse=True)
- def use_v0_only(monkeypatch):
- """
- For PP, we fall back to V0 by default. This means
- that the TP baseline runs with V1 while the PP engine
- runs with V0. This gives divergent results with dummy
- weights. Once we enable V1 by default for PP, we can
- remove this.
- """
- monkeypatch.setenv('VLLM_USE_V1', '0')
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
@ -226,7 +214,7 @@ MULTIMODAL_MODELS = {
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
- "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(),
+ "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(),
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
@ -249,7 +237,7 @@ TEST_MODELS = [
"BAAI/bge-multilingual-gemma2",
# [MULTIMODAL GENERATION]
"OpenGVLab/InternVL2-1B",
- "microsoft/Phi-3.5-vision-instruct",
+ "microsoft/Phi-3-vision-128k-instruct",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
# [LANGUAGE GENERATION - HYBRID ARCH]
"ai21labs/Jamba-tiny-dev",
@ -350,10 +338,6 @@ def _compare_tp(
else:
pp_env = None
- tp_env = {
- "VLLM_USE_V1": vllm_major_version,
- }
pp_args = [
*common_args,
"--pipeline-parallel-size",
@ -378,20 +362,14 @@ def _compare_tp(
]
try:
- compare_two_settings(model_id,
- pp_args,
- tp_args,
- pp_env,
- tp_env,
- method=method)
+ compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
except Exception:
- testing_ray_compiled_graph = pp_env is not None
- if testing_ray_compiled_graph and vllm_major_version == "0":
- # Ray Compiled Graph tests are flaky for V0,
+ if pp_env is None:
+ raise
+ else:
+ # Ray Compiled Graph tests are flaky,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
- else:
- raise
@pytest.mark.parametrize(
@ -402,7 +380,7 @@ def _compare_tp(
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_tp_language_generation(
model_id: str,
parallel_setup: ParallelSetup,
@ -431,7 +409,7 @@ def test_tp_language_generation(
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_tp_language_embedding(
model_id: str,
parallel_setup: ParallelSetup,
@ -460,7 +438,7 @@ def test_tp_language_embedding(
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
- @create_new_process_for_each_test()
+ @fork_new_process_for_each_test
def test_tp_multimodal_generation(
model_id: str,
parallel_setup: ParallelSetup,

Some files were not shown because too many files have changed in this diff.