Compare commits

..

4 Commits

Author SHA1 Message Date
6e1e31a66a add code owner
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:19:17 -07:00
50e80db4ef add mixin
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:15:10 -07:00
d3d6afb355 add mixin
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:12:41 -07:00
808fa43d76 add mixin
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-17 22:02:15 -07:00
846 changed files with 45506 additions and 34032 deletions

View File

@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
@ -163,6 +167,12 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py

View File

@ -58,8 +58,11 @@ function cpu_tests() {
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
# Note: disable Bart until supports V1
pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation \

View File

@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1

View File

@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1

View File

@ -35,7 +35,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine

View File

@ -1,59 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi
# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi
# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"
echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python
# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml
# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras
# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
echo "Prime-RL integration test environment setup complete!"
echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu
echo "Prime-RL integration tests completed!"

View File

@ -6,28 +6,24 @@
# to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emojis allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
# in this case, commands must be specified. the first command runs on the first host, the second
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test belong to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
@ -50,18 +46,22 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/async_engine
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py
- tests/transformers_utils
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s async_engine # AsyncLLMEngine
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
- pytest -v -s transformers_utils # transformers_utils
- label: Python-only Installation Test # 10min
@ -82,12 +82,14 @@ steps:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Entrypoints Unit Tests # 5min
timeout_in_minutes: 10
@ -112,9 +114,10 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
@ -152,6 +155,7 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@ -164,20 +168,12 @@ steps:
- tests/v1/test_internal_lb_dp.py
- tests/v1/test_hybrid_lb_dp.py
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# test with torchrun tp=2 and external_dp=2
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
# test with tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=4 and dp=1
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2, pp=2 and dp=1
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=1 and dp=4 with ep
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with torchrun tp=2 and dp=2 with ep
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@ -189,7 +185,6 @@ steps:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
@ -292,7 +287,6 @@ steps:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
@ -300,12 +294,10 @@ steps:
- pytest -v -s v1/spec_decode
- pytest -v -s v1/kv_connector/unit
- pytest -v -s v1/metrics
- pytest -v -s v1/test_kv_sharing.py
- pytest -v -s v1/test_metrics_reader.py
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@ -328,13 +320,12 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@ -772,9 +763,8 @@ steps:
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
timeout_in_minutes: 60
@ -872,28 +862,26 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
- label: Distributed Tests (2 GPUs) # 110min
timeout_in_minutes: 150
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/compile/test_basic_correctness.py
- tests/compile/test_wrapper.py
- vllm/model_executor/models/
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
- vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
- tests/v1/test_async_llm_dp.py
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
- vllm/v1/engine/
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
@ -902,29 +890,19 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Distributed Model Tests (2 GPUs) # 37min
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/model_executor/model_loader/sharded_state_loader.py
- vllm/model_executor/models/
- tests/basic_correctness/
- tests/model_executor/model_loader/test_sharded_state_loader.py
- tests/models/
commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
- VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60
@ -1058,16 +1036,3 @@ steps:
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
timeout_in_minutes: 30
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
source_file_dependencies:
- vllm/
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh

25
.github/CODEOWNERS vendored
View File

@ -4,8 +4,11 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
@ -34,10 +37,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/worker/kv_cache_initializer_mixin.py @heheda12345
/vllm/v1/offloading @ApostaC
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
@ -46,6 +50,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
@ -58,31 +63,19 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/tests/models/test_transformers.py @hmellor
# Docs
/docs/mkdocs @hmellor
/docs/**/*.yml @hmellor
/requirements/docs.txt @hmellor
.readthedocs.yaml @hmellor
/docs @hmellor
mkdocs.yaml @hmellor
# Linting
.markdownlint.yaml @hmellor
.pre-commit-config.yaml @hmellor
/tools/pre_commit @hmellor
# CPU
/vllm/v1/worker/cpu* @bigPYJ1151
/vllm/v1/worker/^cpu @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151
# Intel GPU
/vllm/v1/worker/xpu* @jikunshang
/vllm/v1/worker/^xpu @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang

View File

@ -43,6 +43,10 @@ body:
Any other things you would like to mention.
validations:
required: false
- type: markdown
attributes:
value: >
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:

19
.github/mergify.yml vendored
View File

@ -171,7 +171,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@ -302,20 +302,3 @@ pull_request_rules:
label:
remove:
- needs-rebase
- name: label-kv-connector
description: Automatically apply kv-connector label
conditions:
- or:
- files~=^examples/online_serving/disaggregated[^/]*/.*
- files~=^examples/offline_inference/disaggregated[^/]*/.*
- files~=^examples/others/lmcache/
- files~=^tests/v1/kv_connector/
- files~=^vllm/distributed/kv_transfer/
- title~=(?i)\bP/?D\b
- title~=(?i)NIXL
- title~=(?i)LMCache
actions:
label:
add:
- kv-connector

View File

@ -49,7 +49,7 @@ repos:
rev: 0.6.17
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@ -60,32 +60,38 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
entry: python tools/pre_commit/mypy.py 0 "local"
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
stages: [pre-commit] # Don't run in CI
<<: &mypy_common
language: python
types_or: [python, pyi]
require_serial: true
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: python tools/pre_commit/mypy.py 1 "3.9"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.9"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: python tools/pre_commit/mypy.py 1 "3.10"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.10"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
entry: python tools/pre_commit/mypy.py 1 "3.11"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.11"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
entry: python tools/pre_commit/mypy.py 1 "3.12"
<<: *mypy_common
entry: tools/mypy.sh 1 "3.12"
language: python
types: [python]
additional_dependencies: *mypy_deps
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@ -149,15 +155,18 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
entry: python tools/check_pickle_imports.py
language: python
types: [python]
additional_dependencies: [regex]
pass_filenames: false
additional_dependencies: [pathspec, regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
additional_dependencies: [regex]
types: [python]
pass_filenames: true
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@ -13,7 +13,6 @@ build:
mkdocs:
configuration: mkdocs.yaml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:

View File

@ -103,15 +103,10 @@ start_server() {
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
# This line checks whether the server is still alive or not,
# since that we should always have permission to send signal to the server process.
kill -0 $server_pid 2> /dev/null || break
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
@ -123,7 +118,7 @@ start_server() {
done
if (( ! server_started )); then
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0

View File

@ -1,31 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from unittest import mock
import numpy as np
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.config import (
CacheConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
)
from vllm.platforms import current_platform
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
def benchmark_propose(args):
def main(args):
rows = []
for max_ngram in args.max_ngram:
collector = TimeCollector(TimeCollector.US)
@ -83,88 +69,10 @@ def benchmark_propose(args):
)
def benchmark_batched_propose(args):
NUM_SPECULATIVE_TOKENS_NGRAM = 10
PROMPT_LOOKUP_MIN = 5
PROMPT_LOOKUP_MAX = 15
MAX_MODEL_LEN = int(1e7)
DEVICE = current_platform.device_type
model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
speculative_config = SpeculativeConfig(
target_model_config=model_config,
target_parallel_config=ParallelConfig(),
method="ngram",
num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
prompt_lookup_max=PROMPT_LOOKUP_MAX,
prompt_lookup_min=PROMPT_LOOKUP_MIN,
)
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),
parallel_config=ParallelConfig(),
load_config=LoadConfig(),
scheduler_config=SchedulerConfig(),
)
# monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
mock_pp_group = mock.MagicMock()
mock_pp_group.world_size = 1
with mock.patch(
"vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
):
runner = GPUModelRunner(vllm_config, DEVICE)
# hack max model len
runner.max_model_len = MAX_MODEL_LEN
runner.drafter.max_model_len = MAX_MODEL_LEN
dummy_input_batch = InputBatch(
max_num_reqs=args.num_req,
max_model_len=MAX_MODEL_LEN,
max_num_batched_tokens=args.num_req * args.num_token,
device=DEVICE,
pin_memory=False,
vocab_size=256000,
block_sizes=[16],
)
dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
dummy_input_batch.spec_decode_unsupported_reqs = ()
dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
dummy_input_batch.token_ids_cpu = np.random.randint(
0, 20, (args.num_req, args.num_token)
)
runner.input_batch = dummy_input_batch
sampled_token_ids = [[0]] * args.num_req
print("Starting benchmark")
# first run is warmup so ignore it
for _ in range(args.num_iteration):
start = time.time()
runner.drafter.propose(
sampled_token_ids,
dummy_input_batch.req_ids,
dummy_input_batch.num_tokens_no_spec,
dummy_input_batch.token_ids_cpu,
dummy_input_batch.spec_decode_unsupported_reqs,
)
end = time.time()
print(f"Iteration time (s): {end - start}")
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of N-gram speculative decode drafting"
)
parser.add_argument(
"--batched", action="store_true", help="consider time to prepare batch"
) # noqa: E501
parser.add_argument(
"--num-iteration",
type=int,
@ -197,17 +105,8 @@ def invoke_main() -> None:
help="Number of speculative tokens to generate",
)
args = parser.parse_args()
if not args.batched:
benchmark_propose(args)
else:
benchmark_batched_propose(args)
main(args)
"""
# Example command lines:
# time python3 benchmarks/benchmark_ngram_proposer.py
# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
""" # noqa: E501
if __name__ == "__main__":
invoke_main() # pragma: no cover

View File

@ -449,8 +449,7 @@ async def benchmark(
def prepare_extra_body(request) -> dict:
extra_body = {}
# Add the schema to the extra_body
extra_body["structured_outputs"] = {}
extra_body["structured_outputs"][request.structure_type] = request.schema
extra_body[request.structure_type] = request.schema
return extra_body
print("Starting initial single prompt test run...")
@ -697,11 +696,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == "json":
if args.structure_type == "guided_json":
return _eval_correctness_json(expected, actual)
elif args.structure_type == "regex":
elif args.structure_type == "guided_regex":
return _eval_correctness_regex(expected, actual)
elif args.structure_type == "choice":
elif args.structure_type == "guided_choice":
return _eval_correctness_choice(expected, actual)
else:
return None
@ -781,18 +780,18 @@ def main(args: argparse.Namespace):
)
if args.dataset == "grammar":
args.structure_type = "grammar"
args.structure_type = "guided_grammar"
elif args.dataset == "regex":
args.structure_type = "regex"
args.structure_type = "guided_regex"
elif args.dataset == "choice":
args.structure_type = "choice"
args.structure_type = "guided_choice"
else:
args.structure_type = "json"
args.structure_type = "guided_json"
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
result_file_name = f"{args.structured_output_ratio}so"
result_file_name = f"{args.structured_output_ratio}guided"
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"

View File

@ -3,7 +3,6 @@
import argparse
import copy
import itertools
import os
import torch
from weight_shapes import WEIGHT_SHAPES
@ -24,45 +23,21 @@ PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"nvfp4": dict(no_a_quant=False, enabled=True),
"nvfp4-noquant": dict(no_a_quant=True, enabled=True),
"fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
"fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
}
_needs_fbgemm = any(
v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
)
if _needs_fbgemm:
try:
from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
triton_scale_nvfp4_quant,
)
except ImportError:
print(
"WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
"These providers will be skipped. Please install fbgemm_gpu with: "
"'pip install fbgemm-gpu-genai' to run them."
)
# Disable FBGEMM providers so the benchmark can run.
for cfg in PROVIDER_CFGS.values():
if cfg.get("fbgemm"):
cfg["enabled"] = False
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
def _quant_weight_nvfp4(b: torch.Tensor, device: str):
# Compute global scale for weight
b_amax = torch.abs(b).max().to(torch.float32)
b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
if "fbgemm" in cfg and cfg["fbgemm"]:
b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
else:
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
return b_fp4, scale_b_fp4, b_global_scale
def build_nvfp4_runner(cfg, a, b, dtype, device):
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)
b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
# Compute global scale for activation
# NOTE: This is generally provided ahead-of-time by the model checkpoint.
@ -71,35 +46,6 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):
# Alpha for the GEMM operation
alpha = 1.0 / (a_global_scale * b_global_scale)
if "fbgemm" in cfg and cfg["fbgemm"]:
if cfg["no_a_quant"]:
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
def run():
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
else:
def run():
a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
return torch.ops.fbgemm.f4f4bf16(
a_fp4,
b_fp4,
scale_a_fp4,
scale_b_fp4,
global_scale=alpha,
use_mx=False,
)
return run
if cfg["no_a_quant"]:
# Pre-quantize activation
@ -184,13 +130,10 @@ if __name__ == "__main__":
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
save_dir = f"bench_nvfp4_res_n{N}_k{K}"
os.makedirs(save_dir, exist_ok=True)
benchmark.run(
print_data=True,
show_plots=True,
save_path=save_dir,
save_path=f"bench_nvfp4_res_n{N}_k{K}",
N=N,
K=K,
)

View File

@ -51,7 +51,7 @@ def calculate_diff(
):
"""Calculate the difference between Inductor and CUDA implementations."""
device = torch.device("cuda")
x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)
x = torch.rand((batch_size * hidden_size, 4096), dtype=dtype, device=device)
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)
@ -59,25 +59,23 @@ def calculate_diff(
torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
try:
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
torch.testing.assert_close(
cuda_out.to(torch.float32),
torch_eager_out.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
out_allclose = lambda o1, o2: torch.allclose(
o1.to(torch.float32),
o2.to(torch.float32),
rtol=1e-3,
atol=1e-5,
)
scale_allclose = lambda s1, s2: torch.allclose(s1, s2, rtol=1e-3, atol=1e-5)
if (
out_allclose(cuda_out, torch_out)
and scale_allclose(cuda_scale, torch_scale)
and out_allclose(cuda_out, torch_eager_out)
and scale_allclose(cuda_scale, torch_eager_scale)
):
print("✅ All implementations match")
except AssertionError as e:
else:
print("❌ Implementations differ")
print(e)
configs = []
@ -93,7 +91,7 @@ def benchmark_quantization(
):
device = torch.device("cuda")
x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)
x = torch.randn(batch_size * hidden_size, 4096, device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)
@ -159,21 +157,21 @@ if __name__ == "__main__":
)
parser.add_argument("-c", "--check", action="store_true")
parser.add_argument(
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
"--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
)
parser.add_argument(
"--hidden-sizes",
type=int,
nargs="+",
default=[896, 1024, 2048, 4096, 7168],
help="Hidden sizes to benchmark",
default=None,
help="Hidden sizes to benchmark (default: 1,16,64,128,256,512,1024,2048,4096)",
)
parser.add_argument(
"--batch-sizes",
type=int,
nargs="+",
default=[1, 16, 128, 512, 1024],
help="Batch sizes to benchmark",
default=None,
help="Batch sizes to benchmark (default: 1,16,32,64,128)",
)
parser.add_argument(
"--group-sizes",
@ -194,8 +192,8 @@ if __name__ == "__main__":
dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
hidden_sizes = args.hidden_sizes
batch_sizes = args.batch_sizes
hidden_sizes = args.hidden_sizes or [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
batch_sizes = args.batch_sizes or [1, 16, 32, 64, 128]
if args.group_sizes is not None:
group_shapes = []

View File

@ -1,406 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""
import nvtx
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
WEIGHT_SHAPES_MOE = {
"mixtral-8x7b": [
[8, 2, 4096, 14336],
],
"deepseek-v2": [
[160, 6, 5120, 12288],
],
"custom-small": [
[8, 2, 2048, 7168],
],
"glm45-fp8": [
[128, 8, 4096, 1408],
],
"Llama-4-Maverick-17B-128E-Instruct-FP8": [
[128, 1, 5120, 8192],
],
}
DEFAULT_MODELS = [
"mixtral-8x7b",
]
DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]
PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]
FP8_DTYPE = current_platform.fp8_dtype()
def bench_run(
results: list,
model: str,
num_experts: int,
topk: int,
per_act_token: bool,
per_out_ch: bool,
mkn: tuple[int, int, int],
):
(m, k, n) = mkn
dtype = torch.half
device = "cuda"
# Create input activations
a = torch.randn((m, k), device=device, dtype=dtype) / 10
# Create weights
w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10
# Create FP8 quantized weights and scales for both kernels
w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)
# Create scales based on quantization strategy
if per_out_ch:
# Per-channel quantization
w1_scale = torch.empty(
(num_experts, 2 * n, 1), device=device, dtype=torch.float32
)
w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
else:
# Per-tensor quantization
w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
# Quantize weights
for expert in range(num_experts):
if per_out_ch:
# Per-channel quantization - not yet implemented properly
# For now, fall back to per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Expand scalar scales to the expected per-channel shape
w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
w2_scale[expert] = w2_scale_temp.expand(k, 1)
else:
# Per-tensor quantization
w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
# Store scalar scales in [1, 1] tensors
w1_scale[expert, 0, 0] = w1_scale_temp
w2_scale[expert, 0, 0] = w2_scale_temp
# Prepare weights for CUTLASS (no transpose needed)
w1_fp8q_cutlass = w1_fp8q # Keep original [E, 2N, K]
w2_fp8q_cutlass = w2_fp8q # Keep original [E, K, N]
# Create router scores and get topk
score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
# WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
# Force per-tensor quantization for all cases to match working e2e setup
a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
# Force per-tensor quantization for all cases
per_act_token = False
# Create stride tensors for CUTLASS
ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
fused_experts(
a,
w1,
w2,
topk_weights,
topk_ids,
quant_config=quant_config,
)
def run_cutlass_moe_fp8(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides1: torch.Tensor,
c_strides2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
a1_scale: torch.Tensor,
a2_scale: torch.Tensor,
num_repeats: int,
):
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
for _ in range(num_repeats):
with nvtx.annotate("cutlass_moe_fp8", color="blue"):
cutlass_moe_fp8(
a=a,
w1_q=w1,
w2_q=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
# Pre-create quantization config to avoid creating it inside CUDA graph
quant_config = fp8_w8a8_moe_quant_config(
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
)
# Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
cutlass_moe_fp8(
a=a,
w1_q=w1_fp8q_cutlass,
w2_q=w2_fp8q_cutlass,
topk_weights=topk_weights,
topk_ids=topk_ids,
ab_strides1=ab_strides1,
ab_strides2=ab_strides2,
c_strides1=c_strides1,
c_strides2=c_strides2,
quant_config=quant_config,
activation="silu",
global_num_experts=num_experts,
)
torch.cuda.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(triton_graph, stream=triton_stream):
# Capture 10 invocations like benchmark_moe.py
for _ in range(10):
fused_experts(
a,
w1_fp8q,
w2_fp8q,
topk_weights,
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
# Timing
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies = []
for _ in range(num_iters):
torch.cuda.synchronize()
start_event.record()
graph.replay()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
# Divide by 10 since graph contains 10 calls
return sum(latencies) / (num_iters * 10)
# Benchmark parameters
num_warmup = 5
num_iters = 100
# Benchmark only CUDA graphs (more reliable and faster)
# Benchmark Triton MoE with CUDA graphs
triton_graph_time = bench_cuda_graph(
triton_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Benchmark CUTLASS MoE with CUDA graphs
cutlass_graph_time = bench_cuda_graph(
cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
)
# Convert ms to us and return results
triton_time_us = triton_graph_time * 1000
cutlass_time_us = cutlass_graph_time * 1000
return {
"batch_size": m,
"triton_time_us": triton_time_us,
"cutlass_time_us": cutlass_time_us,
}
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
all_results = []
for model in args.models:
for tp in args.tp_sizes:
for layer in WEIGHT_SHAPES_MOE[model]:
num_experts = layer[0]
topk = layer[1]
size_k = layer[2]
size_n = layer[3] // tp
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for per_act_token in args.per_act_token_opts:
for per_out_ch in args.per_out_ch_opts:
print(
f"\n=== {model}, experts={num_experts}, topk={topk},"
f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
)
config_results = []
for size_m in args.batch_sizes:
mkn = (size_m, size_k, size_n)
result = bench_run(
[], # Not used anymore
model,
num_experts,
topk,
per_act_token,
per_out_ch,
mkn,
)
if result:
config_results.append(result)
# Print results table for this configuration
if config_results:
print(
f"\n{'Batch Size':<12}"
f"{'Triton (us)':<15}"
f"{'CUTLASS (us)':<15}"
)
print("-" * 45)
for result in config_results:
print(
f"{result['batch_size']:<12}"
f"{result['triton_time_us']:<15.2f}"
f"{result['cutlass_time_us']:<15.2f}"
)
all_results.extend(config_results)
print(f"\nTotal benchmarks completed: {len(all_results)}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
across specified models/shapes/batches
Example usage:
python benchmark_cutlass_moe_fp8.py \
--model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
--tp-sizes 8 \
--batch-size 2 4 8 \
--per-act-token-opts false \
--per-out-ch-opts false
"""
)
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES_MOE.keys(),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
parser.add_argument(
"--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument(
"--per-act-token-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-activation token quantization options (true/false)",
)
parser.add_argument(
"--per-out-ch-opts",
nargs="+",
type=lambda x: x.lower() == "true",
default=[False, True],
help="Per-output channel quantization options (true/false)",
)
args = parser.parse_args()
main(args)

View File

@ -7,10 +7,6 @@ Benchmark script for device communicators:
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
and SymmMemCommunicator (multimem, two-shot).
for NCCL symmetric memory you need to set the environment variables
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
not use fast NVLS implementation for all reduce.
Usage:
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
@ -30,13 +26,7 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.pynccl import (
PyNcclCommunicator,
register_nccl_symmetric_ops,
)
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id,
)
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
@ -108,7 +98,6 @@ class CommunicatorBenchmark:
)
if not self.pynccl_comm.disabled:
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
register_nccl_symmetric_ops(self.pynccl_comm)
else:
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
self.pynccl_comm = None
@ -205,15 +194,6 @@ class CommunicatorBenchmark:
None, # no env variable needed
)
)
communicators.append(
(
"pynccl-symm",
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
None, # no env variable needed
)
)
if self.symm_mem_comm_multimem is not None:
comm = self.symm_mem_comm_multimem
@ -291,9 +271,7 @@ class CommunicatorBenchmark:
# Capture the graph using context manager
with context:
graph = torch.cuda.CUDAGraph()
graph_pool = torch.cuda.graph_pool_handle()
set_graph_pool_id(graph_pool)
with torch.cuda.graph(graph, pool=graph_pool):
with torch.cuda.graph(graph):
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)

View File

@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(
def make_rand_tensors(
a_shape: tuple[int, ...],
b_shape: tuple[int, ...],
c_shape: tuple[int, ...],
a_shape: tuple[int],
b_shape: tuple[int],
c_shape: tuple[int],
a_dtype: torch.dtype,
b_dtype: torch.dtype,
c_dtype: torch.dtype,
@ -243,7 +243,7 @@ class OpType(Enum):
lora_rank: int,
num_loras: int,
num_slices: int,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
) -> tuple[tuple[int], tuple[int], tuple[int]]:
"""
Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type

View File

@ -9,9 +9,6 @@ import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
@ -34,8 +31,6 @@ def run_benchmark(
kv_cache_dtype: str,
kv_cache_layout: str,
num_iters: int,
implementation: str,
benchmark_mode: str,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
@ -43,14 +38,6 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
if implementation not in ("cuda", "triton"):
raise ValueError(
f"Unsupported implementation: {implementation}. "
"Only 'cuda' and 'triton' are supported."
)
if implementation == "triton" and kv_cache_layout == "HND":
return float("nan") # Triton does not support HND layout yet.
current_platform.seed_everything(42)
torch.set_default_device(device)
@ -78,49 +65,27 @@ def run_benchmark(
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# to free unused memory
del key_caches, value_caches
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
if implementation == "cuda":
function_under_test = lambda: ops.reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
else:
function_under_test = lambda: triton_reshape_and_cache_flash(
key, # noqa: F821
value, # noqa: F821
key_cache, # noqa: F821
value_cache, # noqa: F821
slot_mapping, # noqa: F821
kv_cache_dtype,
k_scale,
v_scale,
)
if benchmark_mode == "cudagraph":
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
ops.reshape_and_cache_flash(
key,
value,
key_cache,
value_cache,
slot_mapping,
kv_cache_dtype,
k_scale,
v_scale,
)
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
@ -151,16 +116,10 @@ def main(args):
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout,
num_iters=args.iters,
implementation=args.implementation,
benchmark_mode=args.mode,
device="cuda",
)
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
print(
f"Benchmark results for implementation {args.implementation}"
f" (measuring with {args.mode}):"
)
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
@ -192,21 +151,6 @@ if __name__ == "__main__":
)
parser.add_argument("--iters", type=int, default=100)
parser.add_argument(
"--implementation",
type=str,
choices=["cuda", "triton"],
default="cuda",
)
parser.add_argument(
"--mode",
type=str,
choices=["cudagraph", "no_graph"],
default="cudagraph",
)
args = parser.parse_args()
main(args)

View File

@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any
import torch
import triton
from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)

View File

@ -8,16 +8,12 @@ import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8,
w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (
calc_diff,
fp8_gemm_nt,
get_col_major_tma_aligned_tensor,
per_block_cast_to_fp8,
)
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
def benchmark_shape(m: int,

View File

@ -101,7 +101,6 @@ else()
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
endif()
if (AVX512_FOUND AND NOT AVX512_DISABLED)
@ -178,14 +177,8 @@ elseif (S390_FOUND)
"-mzvector"
"-march=native"
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
if(RVV_FOUND)
message(FAIL_ERROR "Can't support rvv now.")
else()
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
endif()
#
@ -265,8 +258,7 @@ set(VLLM_EXT_SRC
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
"csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC

View File

@ -135,10 +135,10 @@ public:
max_splits = min(16, max_splits);
// TODO: This avoids a hang when the batch size larger than 1 and
// there is more than 1 kv_splits.
// there is more than 4 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
max_splits = min(2, max_splits);
}
// printf(" max_splits = %d\n", max_splits);

View File

@ -14,12 +14,7 @@
// arm implementation
#include "cpu_types_arm.hpp"
#else
#warning "unsupported vLLM cpu implementation, vLLM will compile with scalar"
#include "cpu_types_scalar.hpp"
#endif
#ifdef _OPENMP
#include <omp.h>
#warning "unsupported vLLM cpu implementation"
#endif
#endif

View File

@ -1,513 +0,0 @@
#include <cmath>
#include <cstdint>
#include <cstring>
#include <torch/all.h>
#include "float_convert.hpp"
namespace vec_op {
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)
#else
#define CPU_KERNEL_GUARD_IN(NAME) \
std::cout << #NAME << " invoked." << std::endl;
#define CPU_KERNEL_GUARD_OUT(NAME) \
std::cout << #NAME << " exit." << std::endl;
#endif
#define FORCE_INLINE __attribute__((always_inline)) inline
#define __max(a, b) ((a) > (b) ? (a) : (b))
#define __min(a, b) ((a) < (b) ? (a) : (b))
#define __abs(a) ((a) < (0) ? (0 - a) : (a))
typedef struct f16x8_t {
uint16_t val[8];
} f16x8_t;
typedef struct f16x16_t {
uint16_t val[16];
} f16x16_t;
typedef struct f16x32_t {
uint16_t val[32];
} f16x32_t;
typedef struct f32x4_t {
float val[4];
} f32x4_t;
typedef struct f32x8_t {
float val[8];
} f32x8_t;
typedef struct f32x16_t {
float val[16];
} f32x16_t;
namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
(f(std::integral_constant<T, indexes>{}), ...);
};
}; // namespace
template <typename T, T count, typename F,
typename = std::enable_if_t<std::is_invocable_v<F, T> > >
constexpr void unroll_loop(F&& f) {
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
template <typename T>
struct Vec {
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
};
struct FP32Vec8;
struct FP32Vec16;
struct FP16Vec8 : public Vec<FP16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
f16x8_t reg;
explicit FP16Vec8(const void* ptr)
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
explicit FP16Vec8(const FP32Vec8&);
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
};
struct FP16Vec16 : public Vec<FP16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
f16x16_t reg;
explicit FP16Vec16(const void* ptr)
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
explicit FP16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
void save(void* ptr, const int elem_num) const {
int num = __min(elem_num, VEC_ELEM_NUM);
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
}
};
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
f16x8_t reg;
explicit BF16Vec8(const void* ptr)
: reg(*reinterpret_cast<const f16x8_t*>(ptr)) {};
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const { *reinterpret_cast<f16x8_t*>(ptr) = reg; }
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
f16x16_t reg;
explicit BF16Vec16(const void* ptr)
: reg(*reinterpret_cast<const f16x16_t*>(ptr)) {};
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<f16x16_t*>(ptr) = reg; }
void save(void* ptr, const int elem_num) const {
int num = __min(elem_num, VEC_ELEM_NUM);
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
}
};
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
f16x32_t reg;
explicit BF16Vec32(const void* ptr)
: reg(*reinterpret_cast<const f16x32_t*>(ptr)) {};
explicit BF16Vec32(f16x32_t data) : reg(data) {};
explicit BF16Vec32(BF16Vec8& vec8_data) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
}
}
void save(void* ptr) const { *reinterpret_cast<f16x32_t*>(ptr) = reg; }
};
struct FP32Vec4 : public Vec<FP32Vec4> {
constexpr static int VEC_ELEM_NUM = 4;
f32x4_t reg;
explicit FP32Vec4(float v) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = v;
}
}
explicit FP32Vec4() {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = 0.0f;
}
}
explicit FP32Vec4(const float* ptr)
: reg(*reinterpret_cast<const f32x4_t*>(ptr)) {};
explicit FP32Vec4(f32x4_t data) : reg(data) {};
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {};
};
struct FP32Vec8 : public Vec<FP32Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
f32x8_t reg;
explicit FP32Vec8(float v) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = v;
}
}
explicit FP32Vec8() {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = 0.0f;
}
}
explicit FP32Vec8(const float* ptr)
: reg(*reinterpret_cast<const f32x8_t*>(ptr)) {};
explicit FP32Vec8(f32x8_t data) : reg(data) {};
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
explicit FP32Vec8(const FP16Vec8& v) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = fp16_to_float(v.reg.val[i]);
}
}
FP32Vec8(const BF16Vec8& v) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = bf16_to_float(v.reg.val[i]);
}
}
float reduce_sum() const {
float result = 0;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result += reg.val[i];
}
return result;
}
FP32Vec8 exp() const {
f32x8_t ret;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
ret.val[i] = expf(reg.val[i]);
}
return FP32Vec8(ret);
}
FP32Vec8 tanh() const {
f32x8_t ret;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
ret.val[i] = tanhf(reg.val[i]);
}
return FP32Vec8(ret);
}
FP32Vec8 er() const {
f32x8_t ret;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
ret.val[i] = erf(reg.val[i]);
}
return FP32Vec8(ret);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
f32x8_t ret;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
ret.val[i] = reg.val[i] * b.reg.val[i];
}
return FP32Vec8(ret);
}
FP32Vec8 operator+(const FP32Vec8& b) const {
f32x8_t ret;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
ret.val[i] = reg.val[i] + b.reg.val[i];
}
return FP32Vec8(ret);
}
FP32Vec8 operator-(const FP32Vec8& b) const {
f32x8_t ret;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
ret.val[i] = reg.val[i] - b.reg.val[i];
}
return FP32Vec8(ret);
}
FP32Vec8 operator/(const FP32Vec8& b) const {
f32x8_t ret;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
ret.val[i] = reg.val[i] / b.reg.val[i];
}
return FP32Vec8(ret);
}
void save(void* ptr) const { *reinterpret_cast<f32x8_t*>(ptr) = reg; }
};
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
f32x16_t reg;
explicit FP32Vec16(float v) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = v;
}
}
explicit FP32Vec16() {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = 0.0f;
}
}
explicit FP32Vec16(const float* ptr)
: reg(*reinterpret_cast<const f32x16_t*>(ptr)) {};
explicit FP32Vec16(f32x16_t data) : reg(data) {};
FP32Vec16(const FP32Vec4& data) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
}
}
FP32Vec16(const FP32Vec8& data) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
}
}
FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
explicit FP32Vec16(const FP16Vec16& v) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = fp16_to_float(v.reg.val[i]);
}
}
explicit FP32Vec16(const BF16Vec16& v) {
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
reg.val[i] = bf16_to_float(v.reg.val[i]);
}
}
explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
FP32Vec16 operator*(const FP32Vec16& b) const {
FP32Vec16 result(0.0f);
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result.reg.val[i] = reg.val[i] * b.reg.val[i];
}
return result;
}
FP32Vec16 operator+(const FP32Vec16& b) const {
FP32Vec16 result(0.0f);
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result.reg.val[i] = reg.val[i] + b.reg.val[i];
}
return result;
}
FP32Vec16 operator-(const FP32Vec16& b) const {
FP32Vec16 result(0.0f);
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result.reg.val[i] = reg.val[i] - b.reg.val[i];
}
return result;
}
FP32Vec16 operator/(const FP32Vec16& b) const {
FP32Vec16 result(0.0f);
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result.reg.val[i] = reg.val[i] / b.reg.val[i];
}
return result;
}
FP32Vec16 max(const FP32Vec16& b) const {
FP32Vec16 result(0.0f);
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
}
return result;
}
FP32Vec16 min(const FP32Vec16& b) const {
FP32Vec16 result(0.0f);
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
}
return result;
}
FP32Vec16 abs() const {
FP32Vec16 result(0.0f);
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result.reg.val[i] = __abs(reg.val[i]);
}
return result;
}
float reduce_sum() const {
float result = 0.0f;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result += reg.val[i];
}
return result;
}
float reduce_max() const {
float result = reg.val[0];
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result = __max(reg.val[i], result);
}
return result;
}
float reduce_min() const {
float result = reg.val[0];
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
result = __min(reg.val[i], result);
}
return result;
}
template <int group_size>
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
float sum = 0.0;
int start = idx * group_size;
int end = (idx + 1) * group_size;
for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
sum += reg.val[start];
}
return sum;
}
void save(void* ptr) const { *reinterpret_cast<f32x16_t*>(ptr) = reg; }
};
template <typename T>
struct VecType {
using vec_type = void;
};
template <typename T>
using vec_t = typename VecType<T>::vec_type;
template <>
struct VecType<float> {
using vec_type = FP32Vec8;
};
template <>
struct VecType<c10::Half> {
using vec_type = FP16Vec8;
};
template <>
struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
};
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
/*
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
c10::Half __attribute__((__may_alias__)) *v_ptr =
reinterpret_cast<c10::Half *>(&v);
*ptr = *(v_ptr + 1);
}
*/
template <>
inline void storeFP32<c10::Half>(float v, c10::Half* ptr) {
uint16_t fp16 = float_to_fp16(v);
*reinterpret_cast<uint16_t*>(ptr) = fp16;
}
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
reinterpret_cast<c10::BFloat16*>(&v);
*ptr = *(v_ptr + 1);
}
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
int i = 0;
for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
reg.val[i] = float_to_fp16(v.reg.val[i]);
}
}
inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
int i = 0;
for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
reg.val[i] = float_to_fp16(v.reg.val[i]);
}
}
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
acc = acc + a * b;
}
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
int i = 0;
for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
reg.val[i] = float_to_bf16(v.reg.val[i]);
}
}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
int i = 0;
for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
reg.val[i] = float_to_bf16(v.reg.val[i]);
}
}
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }
}; // namespace vec_op

View File

@ -1,106 +0,0 @@
static float bf16_to_float(uint16_t bf16) {
uint32_t bits = static_cast<uint32_t>(bf16) << 16;
float fp32;
std::memcpy(&fp32, &bits, sizeof(fp32));
return fp32;
}
static uint16_t float_to_bf16(float fp32) {
uint32_t bits;
std::memcpy(&bits, &fp32, sizeof(fp32));
return static_cast<uint16_t>(bits >> 16);
}
/************************************************
* Copyright (c) 2015 Princeton Vision Group
* Licensed under the MIT license.
* Codes below copied from
* https://github.com/PrincetonVision/marvin/tree/master/tools/tensorIO_matlab
*************************************************/
static uint16_t float_to_fp16(float fp32) {
uint16_t fp16;
unsigned x;
unsigned u, remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;
std::memcpy(&x, &fp32, sizeof(fp32));
u = (x & 0x7fffffff);
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
fp16 = 0x7fffU;
return fp16;
}
sign = ((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
fp16 = sign | 0x7c00U;
return fp16;
}
if (u < 0x33000001) {
fp16 = (sign | 0x0000);
return fp16;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
} else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
// Round to nearest even.
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
fp16 = (sign | (exponent << 10) | mantissa);
return fp16;
}
static float fp16_to_float(uint16_t fp16) {
unsigned sign = ((fp16 >> 15) & 1);
unsigned exponent = ((fp16 >> 10) & 0x1f);
unsigned mantissa = ((fp16 & 0x3ff) << 13);
int temp;
float fp32;
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
exponent = 0xff;
} else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
} else {
exponent += 0x70;
}
temp = ((sign << 31) | (exponent << 23) | mantissa);
std::memcpy(&fp32, &temp, sizeof(temp));
return fp32;
}

View File

@ -88,18 +88,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
ops.def(
"dynamic_4bit_int_moe("
"Tensor x, Tensor topk_ids, Tensor topk_weights,"
"Tensor w13_packed, Tensor w2_packed, int H, int I, int I2,"
"int group_size, bool apply_router_weight_on_input, int activation_kind"
") -> Tensor");
ops.impl("dynamic_4bit_int_moe", torch::kCPU, &dynamic_4bit_int_moe_cpu);
// PagedAttention V2.
ops.def(
"paged_attention_v2("

View File

@ -1,38 +0,0 @@
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
// maximum blocks per SM cap
#ifndef VLLM_LAUNCH_BLOCKS_CAP
#define VLLM_LAUNCH_BLOCKS_CAP 4
#endif
// compile-time estimate of max threads per SM for launch bounds.
#ifndef VLLM_MAX_THREADS_PER_SM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#define VLLM_MAX_THREADS_PER_SM 1536
#else
#define VLLM_MAX_THREADS_PER_SM 2048
#endif
#endif
// compute the number of blocks per SM to request in __launch_bounds__
#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
(((VAL) <= 0) \
? 1 \
: (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
// runtime-time helper to compute blocks/SM
static inline int vllm_runtime_blocks_per_sm(int block_threads) {
int device = -1;
cudaGetDevice(&device);
int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM;
cudaDeviceGetAttribute(&max_threads_per_sm,
cudaDevAttrMaxThreadsPerMultiProcessor, device);
int blocks = (block_threads > 0) ? (max_threads_per_sm / block_threads) : 1;
return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
}

View File

@ -1,156 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <torch/all.h>
// _dyn_quant_matmul_4bit is only available on AArch64.
#if defined(__aarch64__)
#include <ATen/ops/_dyn_quant_matmul_4bit.h>
#endif
inline torch::Tensor mm(const torch::Tensor& a, const torch::Tensor& packed_w,
int64_t group_size_eff, int64_t in_features,
int64_t out_features) {
#if defined(__aarch64__)
return at::_ops::_dyn_quant_matmul_4bit::call(a, packed_w, group_size_eff,
in_features, out_features);
#else
TORCH_CHECK(false,
"dynamic 4-bit int MoE path requires AArch64 (ARM64); "
"_dyn_quant_matmul_4bit is unavailable on this architecture");
return {};
#endif
}
enum ActivationKind : int64_t {
SwiGLU_Gu = 0, // act = SiLU(g) * u
SwiGLUOAI = 1, // act = SiLU(u) * g
SiLU = 2 // SiLU
};
torch::Tensor dynamic_4bit_int_moe_cpu(
torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
int64_t activation_kind) {
TORCH_CHECK(x.dim() == 2, "x must be 2D");
TORCH_CHECK(topk_ids.dim() == 2 && topk_weights.dim() == 2,
"topk tensors must be [T, K]");
TORCH_CHECK(
w13_packed.size(0) == w2_packed.size(0),
"w13_packed and w2_packed must have same number of experts in dim 0");
TORCH_CHECK(I2 == 2 * I, "I2 must equal 2*I");
const int64_t T = x.size(0);
const int64_t K = topk_ids.size(1);
const int64_t E = w13_packed.size(0);
const int64_t N = T * K;
auto x_c = x.contiguous();
auto ids_c = topk_ids.contiguous();
auto gates_c = topk_weights.to(at::kFloat).contiguous();
// bucketing tokens -> experts
c10::SmallVector<int64_t, 64> counts(
E, 0); // Small vector uses stack allocation
{
const auto* ids_ptr = ids_c.data_ptr<int64_t>();
for (int64_t i = 0; i < N; ++i) {
const int64_t e_id = ids_ptr[i];
TORCH_CHECK(0 <= e_id && e_id < E, "expert id out of range");
counts[e_id]++;
}
}
c10::SmallVector<int64_t, 65> offsets(E + 1, 0); // ( E +1 )
for (int64_t e = 0; e < E; ++e) offsets[e + 1] = offsets[e] + counts[e];
auto expert_tokens = at::empty({offsets[E]}, ids_c.options());
auto expert_gates = at::empty({offsets[E]}, gates_c.options());
{
c10::SmallVector<int64_t, 64> cursor(E, 0);
const auto* ids_ptr = ids_c.data_ptr<int64_t>();
const auto* gts_ptr = gates_c.data_ptr<float>();
auto* tok_ptr = expert_tokens.data_ptr<int64_t>();
auto* gate_ptr = expert_gates.data_ptr<float>();
for (int64_t t = 0; t < T; ++t) {
const int64_t base = t * K;
for (int64_t k = 0; k < K; ++k) {
const int64_t idx = base + k;
const int64_t e = ids_ptr[idx];
const int64_t p = offsets[e] + (cursor[e]++);
tok_ptr[p] = t;
gate_ptr[p] = gts_ptr[idx];
}
}
}
const int64_t g_eff_13 = (group_size != -1) ? group_size : H;
const int64_t g_eff_2 = (group_size != -1) ? group_size : I;
// Per-expert outputs filled in parallel
std::vector<torch::Tensor> y_list(E);
y_list.resize(E);
at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) {
for (int64_t e = e_begin; e < e_end; ++e) {
const int64_t te = counts[e];
if (te == 0) {
y_list[e] = at::empty({0, H}, x_c.options());
continue;
}
const int64_t start = offsets[e];
auto sel_tokens =
expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto gates_e =
expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te);
auto x_e = x_c.index_select(/*dim=*/0, sel_tokens);
if (apply_router_weight_on_input) {
x_e = x_e.mul(gates_e.unsqueeze(1));
}
auto w13_e = w13_packed.select(/*dim=*/0, e);
auto w2_e = w2_packed.select(/*dim=*/0, e);
// W13
auto y13 =
mm(x_e, w13_e, g_eff_13, /*in_features=*/H, /*out_features=*/I2);
auto g_part = y13.narrow(/*dim=*/1, /*start=*/0, /*length=*/I);
auto u_part = y13.narrow(/*dim=*/1, /*start=*/I, /*length=*/I);
torch::Tensor act;
if (activation_kind == ActivationKind::SwiGLUOAI) { // SwiGLUOAI
constexpr double kAlpha = 1.702; // GPT-OSS default
constexpr double kLimit = 7.0; // GPT-OSS default
auto gate_c = at::clamp_max(g_part, kLimit);
auto up_c = at::clamp(u_part, -kLimit, kLimit);
auto glu = gate_c.mul(at::sigmoid(gate_c.mul(kAlpha)));
act = up_c.add(1.0).mul(glu);
} else { // SiLU , SwiGLU_GU, vLLM maps silu to SiluAndMul()
act = at::silu(g_part).mul(u_part);
}
// W2
auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H);
if (!apply_router_weight_on_input) {
y = y.mul(gates_e.unsqueeze(1));
}
// Store per-expert result
y_list[e] = y;
}
});
// Concatenate all expert outputs to match expert_tokens order
auto Y_all = at::cat(y_list, /*dim=*/0);
auto out = at::zeros({T, H}, x.options());
out =
at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all);
return out;
}

View File

@ -21,7 +21,6 @@
#include <torch/all.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <cuda/std/limits>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
@ -29,6 +28,7 @@ namespace cg = cooperative_groups;
namespace vllm {
namespace moe {
constexpr float kNegInfinity = INFINITY * -1;
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
constexpr int32_t WARP_SIZE = 32;
constexpr int32_t BLOCK_SIZE = 512;
@ -411,30 +411,14 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
return __bfloat162float(val);
}
template <typename T>
__device__ inline T neg_inf() {
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
// so we need to cast from fp32
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
}
template <typename T>
__device__ inline bool is_finite(const T val) {
#if (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 >= 120800)
return cuda::std::isfinite(val);
#else
return isfinite(cuda_cast<float, T>(val));
#endif
}
template <typename T>
__device__ void topk_with_k2(T* output, T const* input,
cg::thread_block_tile<32> const& tile,
int32_t const lane_id,
int const num_experts_per_group) {
// Get the top2 per thread
T largest = neg_inf<T>();
T second_largest = neg_inf<T>();
T largest = -INFINITY;
T second_largest = -INFINITY;
if (num_experts_per_group > WARP_SIZE) {
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@ -529,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel(
warp_id * topk;
s_topk_idx += warp_id * topk;
T value = neg_inf<T>();
T topk_group_value = neg_inf<T>();
T value = kNegInfinity;
T topk_group_value = kNegInfinity;
int32_t num_equalto_topkth_group;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@ -541,8 +525,11 @@ __global__ void group_idx_and_topk_idx_kernel(
if (case_id < num_tokens) {
// calculate group_idx
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
// The check is necessary to avoid abnormal input
if (lane_id < n_group && is_finite(group_scores[lane_id])) {
if (lane_id < n_group &&
(isfinite(cuda_cast<float, T>(
group_scores[lane_id])))) // The check is necessary to avoid
// abnormal input
{
value = group_scores[lane_id];
}
@ -553,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel(
__syncwarp(); // Ensure all threads have valid data before reduction
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
if (value == topk_group_value) {
value = neg_inf<T>();
value = kNegInfinity;
}
pre_count_equal_to_top_value = count_equal_to_top_value;
count_equal_to_top_value =
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
count_equal_to_top_value = __popc(__ballot_sync(
FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
}
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
}
@ -565,10 +552,11 @@ __global__ void group_idx_and_topk_idx_kernel(
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
/* is_stable */ true>
queue((int32_t)topk, neg_inf<T>());
queue((int32_t)topk, -INFINITY);
int count_equalto_topkth_group = 0;
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
bool if_proceed_next_topk =
(topk_group_value != cuda_cast<T, float>(kNegInfinity));
if (case_id < num_tokens && if_proceed_next_topk) {
for (int i_group = 0; i_group < n_group; i_group++) {
if ((group_scores[i_group] > topk_group_value) ||
@ -577,10 +565,11 @@ __global__ void group_idx_and_topk_idx_kernel(
int32_t offset = i_group * num_experts_per_group;
for (int32_t i = lane_id; i < align_num_experts_per_group;
i += WARP_SIZE) {
T candidates = (i < num_experts_per_group) &&
is_finite(scores_with_bias[offset + i])
? scores_with_bias[offset + i]
: neg_inf<T>();
T candidates =
(i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
scores_with_bias[offset + i]))
? scores_with_bias[offset + i]
: cuda_cast<T, float>(kNegInfinity);
queue.add(candidates, offset + i);
}
if (group_scores[i_group] == topk_group_value) {
@ -609,8 +598,7 @@ __global__ void group_idx_and_topk_idx_kernel(
if (i < topk) {
s_topk_value[i] = value;
}
topk_sum +=
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
}
}

View File

@ -44,9 +44,6 @@ __global__ void moe_align_block_size_kernel(
for (size_t i = tid; i < numel; i += stride) {
int expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@ -98,15 +95,12 @@ template <typename scalar_t>
__global__ void count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
size_t numel, int32_t num_experts) {
size_t numel) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
if (expert_id >= num_experts) {
continue;
}
int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
sorted_token_ids[rank_post_pad] = i;
}
@ -275,7 +269,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
}
});
}

View File

@ -328,12 +328,6 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
const std::optional<torch::Tensor>& has_initial_state,
const torch::Tensor& ssm_states, int64_t pad_slot_id);
torch::Tensor dynamic_4bit_int_moe_cpu(
torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights,
torch::Tensor w13_packed, torch::Tensor w2_packed, int64_t H, int64_t I,
int64_t I2, int64_t group_size, bool apply_router_weight_on_input,
int64_t activation_kind);
using fptr_t = int64_t;
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
torch::Tensor& rank_data, int64_t rank,

View File

@ -23,14 +23,9 @@
typedef __hip_bfloat162 __nv_bfloat162;
typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat16_raw __nv_bfloat16_raw;
#if defined(HIP_FP8_TYPE_OCP)
typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
#else
// ROCm 6.2 fallback: only *_fnuz types exist
typedef __hip_fp8_e4m3_fnuz __nv_fp8_e4m3;
typedef __hip_fp8x4_e4m3_fnuz __nv_fp8x4_e4m3;
#endif
#endif
#include "core/registration.h"
@ -370,6 +365,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
int32_t compute_pipeline_offset_64 = 0;
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
__nv_bfloat16 y_max_bf16 = EPS;
__nv_bfloat162 results_bf162[2];
cp_async_wait<NUM_STAGES - 2>();
@ -409,7 +405,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
auto _y_max2 =
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
__nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
// An entire group is assigned to a single warp, so a simple warp reduce
// is used.

View File

@ -26,7 +26,6 @@
#include "dispatch_utils.h"
#include "cuda_utils.h"
#include "launch_bounds_utils.h"
#include "nvfp4_utils.cuh"
namespace vllm {
@ -64,7 +63,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
__global__ void __launch_bounds__(1024, 4)
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out,
uint32_t* SFout) {
@ -132,8 +131,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
int const numBlocksPerSM = 2048 / block.x;
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
VLLM_DISPATCH_HALF_TYPES(

View File

@ -26,13 +26,12 @@
#include "dispatch_utils.h"
#include "nvfp4_utils.cuh"
#include "launch_bounds_utils.h"
namespace vllm {
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
__global__ void __launch_bounds__(512, 4)
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out, uint32_t* SFout,
uint32_t* input_offset_by_experts,
@ -130,7 +129,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
__global__ void __launch_bounds__(1024, 4)
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out, uint32_t* SFout,
uint32_t* input_offset_by_experts,
@ -234,9 +233,8 @@ void quant_impl(void* output, void* output_scale, void* input,
int const workSizePerRow = k / ELTS_PER_THREAD;
int const totalWorkSize = m_topk * workSizePerRow;
dim3 block(std::min(workSizePerRow, 512));
// Get number of blocks per SM
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
// Get number of blocks per SM (assume we can fully utilize the SM).
int const numBlocksPerSM = 2048 / block.x;
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
multiProcessorCount * numBlocksPerSM));
while (grid.x <= multiProcessorCount && block.x > 64) {

View File

@ -26,14 +26,13 @@
#include "dispatch_utils.h"
#include "cuda_utils.h"
#include "launch_bounds_utils.h"
#include "nvfp4_utils.cuh"
namespace vllm {
// Use UE4M3 by default.
template <class Type, bool UE8M0_SF = false>
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
__global__ void __launch_bounds__(512, 4)
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
float const* SFScale, uint32_t* out, uint32_t* SFout) {
using PackedVec = PackedVec<Type>;
@ -76,9 +75,8 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
// Grid, Block size.
// Each thread converts 8 values.
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
// Get number of blocks per SM
int const numBlocksPerSM =
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
// Get number of blocks per SM (assume we can fully utilize the SM).
int const numBlocksPerSM = 2048 / block.x;
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
// Launch the cvt kernel.

View File

@ -12,8 +12,8 @@
#include "../vectorization_utils.cuh"
#include "../../dispatch_utils.h"
__device__ __forceinline__ float GroupReduceMax(float val) {
unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
__device__ __forceinline__ float GroupReduceMax(float val, const int tid) {
unsigned mask = 0xffff;
val = fmaxf(val, __shfl_xor_sync(mask, val, 8));
val = fmaxf(val, __shfl_xor_sync(mask, val, 4));
@ -86,7 +86,7 @@ __global__ void per_token_group_quant_8bit_kernel(
threads_per_group, // stride in group
scalar_op_cache); // scalar handler
local_absmax = GroupReduceMax(local_absmax);
local_absmax = GroupReduceMax(local_absmax, lane_id);
float y_s = local_absmax / max_8bit;
if constexpr (SCALE_UE8M0) {

View File

@ -25,12 +25,6 @@
#include "../attention/dtype_fp8.cuh"
#include "../quantization/fp8/amd/quant_utils.cuh"
// ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
#if !defined(HIP_FP8_TYPE_OCP)
using __hip_fp8_e4m3 = __hip_fp8_e4m3_fnuz;
using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
#endif
#if defined(__HIPCC__) && \
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
#define __HIP__GFX9__

View File

@ -5,14 +5,11 @@
torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
const int64_t rows_per_block);
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const std::optional<at::Tensor>& in_bias,
torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
const int64_t CuCount);
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
const at::Tensor& scale_a, const at::Tensor& scale_b,
const int64_t CuCount);
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount);
void paged_attention(
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,

View File

@ -292,9 +292,8 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_sml_(const int K, const int M, const int Bx, const int By,
const scalar_t* B, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2;
#if defined(__HIP__MI3XX__)
@ -485,14 +484,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][i] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
}
// if (commitColumn[i]) C[m + i + n * M] = __float2half(sum[n][i]);
C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
}
}
@ -537,9 +529,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if (BIAS)
sum4[n][i][0] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
// if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
}
@ -551,10 +541,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx,
const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
__global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
@ -565,9 +553,8 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const int Bx,
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_(const int K, const int M, const int Bx, const int By,
const scalar_t* B, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
wvSplitK_hf_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2;
#if defined(__HIP__MI3XX__)
@ -785,17 +772,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if (commitColumn[i]) {
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][i] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
}
if (commitColumn[i])
C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
}
}
}
}
@ -840,12 +818,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if (commitColumn[i]) {
if (BIAS)
sum4[n][i][0] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
// if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
}
}
@ -868,10 +842,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_(const int K, const int M, const int Bx,
const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
__global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
@ -882,9 +854,8 @@ __global__ void wvSplitK_hf_(const int K, const int M, const int Bx,
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitK_hf_big_(const int K, const int M, const int Bx, const int By,
const scalar_t* B, const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE / 2;
#if defined(__HIP__MI3XX__)
@ -1153,17 +1124,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if (commitColumn[i]) {
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][i] += __half2float(BIAS[(m + i) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][i] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
}
if (commitColumn[i])
C[m + i + n * M] = __float2s<scalar_t>(sum[n][i]);
}
}
}
}
@ -1204,12 +1166,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 63) {
for (int n = 0; n < N; n++) {
for (int i = 0; i < YTILE; i++) {
if (commitColumn[i]) {
if (BIAS)
sum4[n][i][0] +=
__bfloat162float(BIAS[(m + i) % Bx + (n % By) * M]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
// if (commitColumn[i]) C[n + i + m * N] = __float2half(sum[n][i]);
C[m + i + n * M] = __float2bfloat16(sum4[n][i][0]);
}
}
}
@ -1232,10 +1190,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#else // !defined(__HIP__GFX9__) TODO: Add NAVI support
template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
int UNRL, int N>
__global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
const int By, const scalar_t* B,
const scalar_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
__global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B,
const scalar_t* __restrict__ A, scalar_t* C,
const int _WvPrGrp, const int CuCount) {
UNREACHABLE_CODE
}
@ -1270,20 +1226,11 @@ int mindiv(int N, int div1, int div2) {
return rtn;
}
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const std::optional<at::Tensor>& in_bias,
torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
const int64_t CuCount) {
auto M_in = in_a.size(0);
auto K_in = in_a.size(1);
auto N_in = in_b.size(0);
auto Bx_in =
(in_bias.has_value() && in_bias->numel() > 0)
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
: 1;
auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
in_bias->sizes().size() == 2)
? in_bias->size(0)
: 1;
TORCH_CHECK(in_a.dtype() == in_b.dtype());
TORCH_CHECK(K_in % 8 == 0, "k % 8 == 0");
@ -1307,18 +1254,18 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
CuCount); \
} else if (K_in * N_in <= max_lds_len * 1.2) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
<<<grid, block, 0, stream>>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \
CuCount); \
} \
}
@ -1326,10 +1273,6 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
using fptype = typename scalar<scalar_t>::type;
fptype* af4 = reinterpret_cast<fptype*>(in_a.data_ptr());
const fptype* bf4 = reinterpret_cast<const fptype*>(in_b.data_ptr());
const fptype* biasf4 =
(in_bias.has_value() && in_bias->numel() > 0)
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
: nullptr;
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
switch (N_in) {
case 1:
@ -1357,9 +1300,8 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int Bx,
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const fp8_t* B,
const fp8_t* __restrict__ A, scalar_t* C,
const float* __restrict__ s_A,
const float* __restrict__ s_B, const int _WvPrGrp,
const int CuCount) {
@ -1511,17 +1453,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
if (threadIdx.x == 0) {
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
if (y + m >= M) break; // To avoid mem access fault.
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][y][0] +=
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
}
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]); // * sA * sB);
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
}
}
}
@ -1533,9 +1465,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS,
const fp8_t* B, const fp8_t* __restrict__ A,
scalar_t* C, const float* __restrict__ s_A,
const float* __restrict__ s_B,
const int _WvPrGrp, const int CuCount) {
@ -1547,9 +1477,8 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M,
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void __launch_bounds__(WvPrGrp* THRDS)
wvSplitKQ_hf_(const int K, const int Kp, const int M, const int Bx,
const int By, const fp8_t* B, const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
wvSplitKQ_hf_(const int K, const int Kp, const int M, const fp8_t* B,
const fp8_t* __restrict__ A, scalar_t* C,
const float* __restrict__ s_A, const float* __restrict__ s_B,
const int _WvPrGrp, const int CuCount) {
constexpr int max_lds_len = LDS_SIZE;
@ -1697,16 +1626,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
for (int n = 0; n < N; n++) {
for (int y = 0; y < YTILE; y++) {
if (y + m >= M) break; // To avoid mem access fault.
sum[n][y][0] *= sA * sB;
if constexpr (std::is_same_v<scalar_t, half>) {
if (BIAS)
sum[n][y][0] += __half2float(BIAS[(m + y) % Bx + (n % By) * M]);
} else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
if (BIAS)
sum[n][y][0] +=
__bfloat162float(BIAS[(m + y) % Bx + (n % By) * M]);
}
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0] * sA * sB);
}
}
}
@ -1718,19 +1638,16 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
int A_CHUNK, int UNRL, int N>
__global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M,
const int Bx, const int By, const fp8_t* B,
const fp8_t* __restrict__ A,
const scalar_t* __restrict__ BIAS, scalar_t* C,
const float* __restrict__ s_A,
const fp8_t* B, const fp8_t* __restrict__ A,
scalar_t* C, const float* __restrict__ s_A,
const float* __restrict__ s_B, const int _WvPrGrp,
const int CuCount) {
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support
void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
const at::Tensor& scale_a, const at::Tensor& scale_b,
void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
at::Tensor& scale_a, at::Tensor& scale_b,
const int64_t CuCount) {
static c10::ScalarType kFp8Type = is_fp8_ocp()
? c10::ScalarType::Float8_e4m3fn
@ -1739,15 +1656,6 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
auto K_in = in_a.size(1);
auto N_in = in_b.size(0);
auto Kp_in = in_a.stride(0);
auto Bx_in =
(in_bias.has_value() && in_bias->numel() > 0)
? (in_bias->sizes().size() == 2) ? in_bias->size(1) : in_bias->size(0)
: 1;
auto By_in = (in_bias.has_value() && in_bias->numel() > 0 &&
in_bias->sizes().size() == 2)
? in_bias->size(0)
: 1;
TORCH_CHECK(K_in % 16 == 0, "k % 16 == 0");
TORCH_CHECK(in_a.dtype() == in_b.dtype() && in_a.dtype() == kFp8Type);
TORCH_CHECK(out_c.dtype() == torch::kFloat16 ||
@ -1765,15 +1673,13 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
__wvPrGrp, CuCount); \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, Bx_in, By_in, a_ptr, \
b_ptr, bias_ptr, c_ptr, s_a, s_b, \
__wvPrGrp, CuCount); \
<<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \
s_a, s_b, __wvPrGrp, CuCount); \
} \
}
@ -1785,9 +1691,6 @@ void wvSplitKQ(const at::Tensor& in_a, const at::Tensor& in_b,
VLLM_DISPATCH_FP8_TYPES(in_a.scalar_type(), "wvSplitKQ", [&] {
auto a_ptr = in_a.data_ptr<fp8_t>();
auto b_ptr = in_b.data_ptr<fp8_t>();
auto bias_ptr = (in_bias.has_value() && in_bias->numel() > 0)
? reinterpret_cast<fptype*>(in_bias->data_ptr())
: nullptr;
switch (N_in) {
case 1:
WVSPLITKQ(16, 2, 2, 2, 2, 2, 2, 1)

View File

@ -22,14 +22,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
// Custom gemm op for skinny matrix-matrix multiplication
rocm_ops.def(
"wvSplitK(Tensor in_a, Tensor in_b, Tensor? in_bias, int CuCount) -> "
"wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> "
"Tensor");
rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);
// wvSplitK for fp8
rocm_ops.def(
"wvSplitKQ(Tensor in_a, Tensor in_b, Tensor? in_bias, Tensor! out_c, "
"Tensor scale_a, "
"wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! out_c, Tensor scale_a, "
" Tensor scale_b, int CuCount) -> ()");
rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ);

View File

@ -114,6 +114,9 @@ WORKDIR /workspace/vllm
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
sed -i 's/^torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \

View File

@ -29,10 +29,7 @@ ARG VLLM_BRANCH="main"
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
&& git checkout FETCH_HEAD \
&& if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
git remote add upstream "https://github.com/vllm-project/vllm.git" \
&& git fetch upstream ; fi
&& git checkout FETCH_HEAD
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------

View File

@ -1,23 +1,25 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="f9e5bf54"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="b2fb6885"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
ARG HIPBLASLT_BRANCH="aa0bda7b"
ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
ARG LEGACY_HIPBLASLT_OPTION=
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="f717b2af"
ARG PYTORCH_VISION_BRANCH="v0.21.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394"
ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="2ab9f4cd"
ARG AITER_BRANCH="4822e675"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950
ARG PYTHON_VERSION=3.12
@ -43,6 +45,29 @@ RUN apt-get update -y \
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION
RUN git clone https://github.com/ROCm/hipBLAS-common.git
RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
RUN cd hipBLAS-common \
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
&& mkdir build \
&& cd build \
&& cmake .. \
&& make package \
&& dpkg -i ./*.deb
RUN git clone https://github.com/ROCm/hipBLASLt
RUN cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& apt-get install -y llvm-dev \
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \
&& make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
@ -65,6 +90,8 @@ ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
pip install -r requirements.txt && git submodule update --init --recursive \
@ -75,20 +102,14 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install
FROM base AS build_fa
ARG FA_BRANCH
ARG FA_REPO
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN git clone ${FA_REPO}
RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter
ARG AITER_BRANCH
@ -100,15 +121,15 @@ RUN cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
FROM base AS debs
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
cp /install/*.deb /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
@ -117,10 +138,24 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
FROM base AS final
RUN --mount=type=bind,from=debs,src=/app/debs,target=/install \
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \
&& perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
&& perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
&& perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
pip install /install/*.whl
ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG TRITON_BRANCH
ARG TRITON_REPO
ARG PYTORCH_BRANCH
@ -132,6 +167,9 @@ ARG FA_REPO
ARG AITER_BRANCH
ARG AITER_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
@ -139,6 +177,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.LoRAConfig][]
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.StructuredOutputsConfig][]
- [vllm.config.DecodingConfig][]
- [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][]
@ -46,6 +46,7 @@ Engine classes for offline and online inference.
Inference parameters for vLLM APIs.
[](){ #sampling-params }
[](){ #pooling-params }
- [vllm.SamplingParams][]
- [vllm.PoolingParams][]

View File

@ -139,9 +139,9 @@ there is relatively little gain from TP. On the other hand, TP incurs significan
overhead because of all-reduce being performed after every layer.
Given this, it may be advantageous to instead shard the batched input data using TP, essentially
performing batch-level DP. This has been shown to improve the throughput and TTFT by around 10% for
performing batch-level DP. This has been shown to improve the throughput by around 10% for
`tensor_parallel_size=8`. For vision encoders that use hardware-unoptimized Conv3D operations,
batch-level DP can provide another 40% improvement compared to regular TP.
batch-level DP can provide another 40% increase to throughput compared to regular TP.
Nevertheless, since the weights of the multi-modal encoder are replicated across each TP rank,
there will be a minor increase in memory consumption and may cause OOM if you can barely fit the model already.
@ -172,15 +172,13 @@ Batch-level DP needs to be implemented on a per-model basis,
and enabled by setting `supports_encoder_tp_data = True` in the model class.
Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature.
Known supported models (with corresponding benchmarks):
Known supported models:
- dots_ocr (<gh-pr:25466>)
- GLM-4.1V or above (<gh-pr:23168>)
- InternVL (<gh-pr:23909>)
- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
- Kimi-VL (<gh-pr:23817>)
- Llama4 (<gh-pr:18368>)
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
- Qwen2-VL or above (<gh-pr:22742>, <gh-pr:24955>, <gh-pr:25445>)
- Qwen2.5-VL (<gh-pr:22742>)
- Step3 (<gh-pr:22697>)
## Input Processing

View File

@ -26,123 +26,113 @@ See <gh-file:LICENSE>.
## Developing
The first step of contributing to vLLM is to clone the GitHub repository:
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
Check out the [building from source][build-from-source] documentation for details.
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
### Building the docs with MkDocs
#### Introduction to MkDocs
[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
#### Install MkDocs and Plugins
Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
```bash
uv pip install -r requirements/docs.txt
```
!!! note
Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
#### Verify Installation
Confirm that MkDocs is correctly installed:
```bash
mkdocs --version
```
Example output:
```console
mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
```
#### Clone the `vLLM` repository
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
```
Then, configure your Python virtual environment.
#### Start the Development Server
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
If you are only developing vLLM's Python code, install vLLM using:
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
```bash
VLLM_USE_PRECOMPILED=1 uv pip install -e .
mkdocs serve
```
If you are developing vLLM's Python and CUDA/C++ code, install vLLM using:
Example output:
```bash
uv pip install -e .
```console
INFO - Documentation built in 106.83 seconds
INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
INFO - [22:02:02] Serving on http://127.0.0.1:8000/
```
For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section.
#### View in Your Browser
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
#### Learn More
For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
## Testing
??? console "Commands"
```bash
# These commands are only for Nvidia CUDA platforms.
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
# Linting, formatting and static type checking
pre-commit install
# You can manually run pre-commit with
pre-commit run --all-files --show-diff-on-failure
# To manually run something from CI that does not run
# locally by default, you can run:
pre-commit run mypy-3.9 --hook-stage manual --all-files
# Unit tests
pytest tests/
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
!!! tip
vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
### Linting
vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as:
```bash
uv pip install pre-commit
pre-commit install
```
vLLM's `pre-commit` hooks will now run automatically every time you commit.
!!! tip "Tips"
You can manually run the `pre-commit` hooks using:
```bash
pre-commit run # runs on staged files
pre-commit run -a # runs on all files (short for --all-files)
```
---
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
```bash
pre-commit run --hook-stage manual markdownlint
pre-commit run --hook-stage manual mypy-3.9
```
### Documentation
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
Get started with:
```bash
uv pip install -r requirements/docs.txt
```
!!! tip
Ensure that your Python version is compatible with the plugins
(e.g., `mkdocs-awesome-nav` requires Python 3.10+)
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it.
From the root of the repository, run:
```bash
mkdocs serve # with API ref (~10 minutes)
API_AUTONAV_EXCLUDE=vllm mkdocs serve # API ref off (~15 seconds)
```
Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready!
Open <http://127.0.0.1:8000/> in your browser to see it.
For additional features and advanced configurations, refer to the:
- [MkDocs documentation](https://www.mkdocs.org/)
- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use)
### Testing
vLLM uses `pytest` to test the codebase.
```bash
# Install the test dependencies used in CI (CUDA only)
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
# Install some common test dependencies (hardware agnostic)
uv pip install pytest pytest-asyncio
# Run all tests
pytest tests/
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
!!! tip "Install python3-dev if Python.h is missing"
!!! note "Install python3-dev if Python.h is missing"
If any of the above commands fails with `Python.h: No such file or directory`, install
`python3-dev` with `sudo apt install python3-dev`.
!!! warning "Warnings"
!!! note
Currently, the repository is not fully checked by `mypy`.
---
!!! note
Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
platform to run unit tests locally, rely on the continuous integration system to run the tests for
now.
@ -204,7 +194,8 @@ appropriately to indicate the type of change. Please use one of the following:
The PR needs to meet the following code quality standards:
- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
- Pass all linter checks.
- Pass all linter checks. Please use `pre-commit` to format your code. See
<https://pre-commit.com/#usage> if `pre-commit` is new to you.
- The code needs to be well-documented to ensure future contributors can easily
understand the code.
- Include sufficient tests to ensure the project stays correct and robust. This

View File

@ -156,6 +156,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@ -229,6 +230,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@ -243,6 +245,7 @@ vllm bench serve \
```bash
vllm bench serve \
--backend openai-chat \
--endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@ -680,7 +683,7 @@ vllm bench serve \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completions
--endpoint /v1/chat/completion
```
##### Videos (ShareGPT4Video)
@ -707,7 +710,7 @@ vllm bench serve \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completions
--endpoint /v1/chat/completion
```
##### Synthetic Random Images (random-mm)

View File

@ -36,23 +36,22 @@ th:not(:first-child) {
}
</style>
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
| best-of | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | | |
| beam-search | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ | |
| [prompt-embeds](prompt_embeds.md) | ✅ | [](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ |
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | |
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
| best-of | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | |
| beam-search | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ |
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
@ -77,4 +76,3 @@ th:not(:first-child) {
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8477) | ✅ | ❌ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [](gh-issue:25097) |

View File

@ -23,7 +23,7 @@ Now supports 5 types of connectors:
- **SharedStorageConnector**: refer to <gh-file:examples/offline_inference/disaggregated-prefill-v1/run.sh> for the example usage of SharedStorageConnector disaggregated prefilling.
- **LMCacheConnectorV1**: refer to <gh-file:examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh> for the example usage of LMCacheConnectorV1 disaggregated prefilling which uses NIXL as the underlying KV transmission.
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv. For detailed usage guide, see [NixlConnector Usage Guide](nixl_connector_usage.md).
- **NixlConnector**: refer to <gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh> for the example usage of NixlConnector disaggregated prefilling which support fully async send/recv.
- **P2pNcclConnector**: refer to <gh-file:examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh> for the example usage of P2pNcclConnector disaggregated prefilling.
- **MultiConnector**: take advantage of the kv_connector_extra_config: dict[str, Any] already present in KVTransferConfig to stash all the connectors we want in an ordered list of kwargs.such as:
@ -31,18 +31,6 @@ Now supports 5 types of connectors:
--kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
```
For NixlConnector, you may also specify one or multiple NIXL_Backend. Such as:
```bash
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_buffer_device":"cuda", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'
```
- **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker):
```bash
--kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "num_cpu_blocks": 1000}}'
```
## Benchmarks
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.

View File

@ -1,159 +0,0 @@
# NixlConnector Usage Guide
NixlConnector is a high-performance KV cache transfer connector for vLLM's disaggregated prefilling feature. It provides fully asynchronous send/receive operations using the NIXL library for efficient cross-process KV cache transfer.
## Prerequisites
### Installation
Install the NIXL library: `uv pip install nixl`, as a quick start.
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
### Transport Configuration
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
```bash
# Example UCX configuration, adjust according to your enviroment
export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
```
!!! tip
When using UCX as the transport backend, NCCL environment variables (like `NCCL_IB_HCA`, `NCCL_SOCKET_IFNAME`) are not applicable to NixlConnector, so configure UCX-specific environment variables instead of NCCL variables.
## Basic Usage (on the same host)
### Producer (Prefiller) Configuration
Start a prefiller instance that produces KV caches
```bash
# 1st GPU as prefiller
CUDA_VISIBLE_DEVICES=0 \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
vllm serve Qwen/Qwen3-0.6B \
--port 8100 \
--enforce-eager \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
```
### Consumer (Decoder) Configuration
Start a decoder instance that consumes KV caches:
```bash
# 2nd GPU as decoder
CUDA_VISIBLE_DEVICES=1 \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5601 \
vllm serve Qwen/Qwen3-0.6B \
--port 8200 \
--enforce-eager \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
```
### Proxy Server
Use a proxy server to route requests between prefiller and decoder:
```bash
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
--port 8192 \
--prefiller-hosts localhost \
--prefiller-ports 8100 \
--decoder-hosts localhost \
--decoder-ports 8200
```
## Environment Variables
- `VLLM_NIXL_SIDE_CHANNEL_PORT`: Port for NIXL handshake communication
- Default: 5600
- **Required for both prefiller and decoder instances**
- Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine
- For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank (e.g., with `--tensor-parallel-size=4` and base_port=5600, tp_rank 0..3 use ports 5600, 5601, 5602, 5603 on that node).
- Used for the initial NIXL handshake between the prefiller and the decoder
- `VLLM_NIXL_SIDE_CHANNEL_HOST`: Host for side channel communication
- Default: "localhost"
- Set when prefiller and decoder are on different machines
- Connection info is passed via KVTransferParams from prefiller to decoder for handshake
- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefillers KV cache for a particular request. (Optional)
- Default: 120
- If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
## Multi-Instance Setup
### Multiple Prefiller Instances on Different Machines
```bash
# Prefiller 1 on Machine A (example IP: ${IP1})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP1} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
# Prefiller 2 on Machine B (example IP: ${IP2})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP2} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
```
### Multiple Decoder Instances on Different Machines
```bash
# Decoder 1 on Machine C (example IP: ${IP3})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP3} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
# Decoder 2 on Machine D (example IP: ${IP4})
VLLM_NIXL_SIDE_CHANNEL_HOST=${IP4} \
VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
UCX_NET_DEVICES=all \
vllm serve Qwen/Qwen3-0.6B --port 8000 \
--tensor-parallel-size 8 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
```
### Proxy for Multiple Instances
```bash
python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
--port 8192 \
--prefiller-hosts ${IP1} ${IP2} \
--prefiller-ports 8000 8000 \
--decoder-hosts ${IP3} ${IP4} \
--decoder-ports 8000 8000
```
### KV Role Options
- **kv_producer**: For prefiller instances that generate KV caches
- **kv_consumer**: For decoder instances that consume KV caches from prefiller
- **kv_both**: Enables symmetric functionality where the connector can act as both producer and consumer. This provides flexibility for experimental setups and scenarios where the role distinction is not predetermined.
!!! tip
NixlConnector currently does not distinguish `kv_role`; the actual prefiller/decoder roles are determined by the upper-level proxy (e.g., `toy_proxy_server.py` using `--prefiller-hosts` and `--decoder-hosts`).
Therefore, `kv_role` in `--kv-transfer-config` is effectively a placeholder and does not affect NixlConnector's behavior.
## Example Scripts/Code
Refer to these example scripts in the vLLM repository:
- [run_accuracy_test.sh](gh-file:tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](gh-file:tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](gh-file:tests/v1/kv_connector/nixl_integration/test_accuracy.py)

View File

@ -6,6 +6,9 @@ This page teaches you how to pass prompt embedding inputs to vLLM.
The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.
!!! note
Prompt embeddings are currently only supported in the v0 engine.
## Offline Inference
To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:

View File

@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |
!!! note
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.

View File

@ -6,40 +6,29 @@ vLLM supports the generation of structured outputs using
This document shows you some examples of the different options that are
available to generate structured outputs.
!!! warning
If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
- `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
- `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`
- `guided_choice` -> `{"structured_outputs": {"choice": ...}}` or `StructuredOutputsParams(choice=...)`
- `guided_grammar` -> `{"structured_outputs": {"grammar": ...}}` or `StructuredOutputsParams(grammar=...)`
- `guided_whitespace_pattern` -> `{"structured_outputs": {"whitespace_pattern": ...}}` or `StructuredOutputsParams(whitespace_pattern=...)`
- `structural_tag` -> `{"structured_outputs": {"structural_tag": ...}}` or `StructuredOutputsParams(structural_tag=...)`
- `guided_decoding_backend` -> Remove this field from your request
## Online Serving (OpenAI API)
You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API.
The following parameters are supported, which must be added as extra parameters:
- `choice`: the output will be exactly one of the choices.
- `regex`: the output will follow the regex pattern.
- `json`: the output will follow the JSON schema.
- `grammar`: the output will follow the context free grammar.
- `guided_choice`: the output will be exactly one of the choices.
- `guided_regex`: the output will follow the regex pattern.
- `guided_json`: the output will follow the JSON schema.
- `guided_grammar`: the output will follow the context free grammar.
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
Structured outputs are supported by default in the OpenAI-Compatible Server. You
may choose to specify the backend to use by setting the
`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
which will try to choose an appropriate backend based on the details of the
request. You may also choose a specific backend, along with
some options. A full set of options is available in the `vllm serve --help`
text.
Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one:
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
??? code
@ -56,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `choice`, as
messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
??? code
@ -74,18 +63,18 @@ The next example shows how to use the `regex`. The idea is to generate an email
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
}
],
extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
)
print(completion.choices[0].message.content)
```
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `json` parameter in two different ways:
For this we can use the `guided_json` parameter in two different ways:
- Using directly a [JSON Schema](https://json-schema.org/)
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
The next example shows how to use the `response_format` parameter with a Pydantic model:
The next example shows how to use the `guided_json` parameter with a Pydantic model:
??? code
@ -130,7 +119,7 @@ The next example shows how to use the `response_format` parameter with a Pydanti
JSON schema and how the fields should be populated. This can improve the
results notably in most cases.
Finally we have the `grammar` option, which is probably the most
Finally we have the `guided_grammar` option, which is probably the most
difficult to use, but it´s really powerful. It allows us to define complete
languages like SQL queries. It works by using a context free EBNF grammar.
As an example, we can use to define a specific format of simplified SQL queries:
@ -160,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
}
],
extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
```
@ -303,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
## Offline Inference
Offline inference allows for the same types of structured outputs.
To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
The main available options inside `StructuredOutputsParams` are:
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
The main available options inside `GuidedDecodingParams` are:
- `json`
- `regex`
@ -320,12 +309,12 @@ shown below:
```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams
from vllm.sampling_params import GuidedDecodingParams
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params,

View File

@ -71,7 +71,7 @@ This example demonstrates:
* Making a request with `tool_choice="auto"`
* Handling the structured response and executing the corresponding function
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
Remember that it's the caller's responsibility to:
@ -83,18 +83,19 @@ For more advanced usage, including parallel tool calls and different model-speci
## Named Function Calling
vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
high-quality one.
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
## Required Function Calling
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
@ -310,15 +311,6 @@ Flags:
* For non-reasoning: `--tool-call-parser hunyuan_a13b`
* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning`
### LongCat-Flash-Chat Models (`longcat`)
Supported models:
* `meituan-longcat/LongCat-Flash-Chat`
* `meituan-longcat/LongCat-Flash-Chat-FP8`
Flags: `--tool-call-parser longcat`
### GLM-4.5 Models (`glm45`)
Supported models:
@ -328,15 +320,6 @@ Supported models:
Flags: `--tool-call-parser glm45`
### Qwen3-Coder Models (`qwen3_xml`)
Supported models:
* `Qwen/Qwen3-480B-A35B-Instruct`
* `Qwen/Qwen3-Coder-30B-A3B-Instruct`
Flags: `--tool-call-parser qwen3_xml`
### Models with Pythonic Tool Calls (`pythonic`)
A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.

View File

@ -20,80 +20,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
Install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Install the required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```
Optionally, build a portable wheel which you can then install elsewhere:
```bash
VLLM_TARGET_DEVICE=cpu uv build --wheel
```
```bash
uv pip install dist/*.whl
```
??? console "pip"
```bash
VLLM_TARGET_DEVICE=cpu python -m build --wheel --no-isolation
```
```bash
pip install dist/*.whl
```
!!! example "Troubleshooting"
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
```toml title="pyproject.toml"
[build-system]
requires = [
"cmake>=3.26.1",
...
"torch==X.Y.Z+cpu" # <-------
]
```
- If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
--8<-- "docs/getting_started/installation/cpu/build.inc.md"
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
@ -130,4 +57,4 @@ docker run --rm \
# --8<-- [end:build-image-from-source]
# --8<-- [start:extra-information]
# --8<-- [end:extra-information]
# --8<-- [end:extra-information]

View File

@ -1,4 +1,4 @@
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
```bash
uv venv --python 3.12 --seed

View File

@ -32,9 +32,8 @@ def auto_mock(module, attr, max_mocks=50):
for _ in range(max_mocks):
try:
# First treat attr as an attr, then as a submodule
with patch("importlib.metadata.version", return_value="0.0.0"):
return getattr(importlib.import_module(module), attr,
importlib.import_module(f"{module}.{attr}"))
return getattr(importlib.import_module(module), attr,
importlib.import_module(f"{module}.{attr}"))
except importlib.metadata.PackageNotFoundError as e:
raise e
except ModuleNotFoundError as e:
@ -168,5 +167,5 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
# Specify encoding for building on Windows
with open(doc_path, "w", encoding="utf-8") as f:
f.write(super(type(parser), parser).format_help())
f.write(parser.format_help())
logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))

View File

@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
In vLLM, generative models implement the[VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
which are then passed through [Sampler][vllm.v1.sample.sampler.Sampler] to obtain the final text.
which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
## Configuration

View File

@ -59,7 +59,7 @@ enabling the corresponding APIs:
#### Predefined models
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
you can override some of its attributes via the `--pooler-config` option.
you can override some of its attributes via the `--override-pooler-config` option.
#### Converted models
@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default:
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--pooler-config` option,
You can further customize this via the `--override-pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference

View File

@ -17,24 +17,9 @@ These models are what we list in [supported-text-models][supported-text-models]
### Transformers
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases.
Currently, the Transformers backend works for the following:
- Modalities: embedding models, language models and vision-language models*
- Architectures: encoder-only, decoder-only
- Attention types: full attention and/or sliding attention
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
- All the features listed in the [compatibility matrix](../features/README.md#feature-x-feature)
- Any combination of the following vLLM parallelisation schemes:
- Pipeline parallel
- Tensor parallel
Checking if the modeling backend is Transformers is as simple as:
To check if the modeling backend is Transformers, you can simply do this:
```python
from vllm import LLM
@ -42,12 +27,16 @@ llm = LLM(model=...) # Name or path of your model
llm.apply_model(lambda model: print(type(model)))
```
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
!!! tip
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
!!! note
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
!!! note
In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
#### Custom models
@ -77,11 +66,10 @@ This section details the necessary modifications to make to a Transformers compa
To make your model compatible with the Transformers backend, it needs:
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`.
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
3. `MyModel` must contain `_supports_attention_backend = True`.
<details class="code">
<details>
<summary>modeling_my_model.py</summary>
```python
@ -90,7 +78,6 @@ from transformers import PreTrainedModel
from torch import nn
class MyAttention(nn.Module):
is_causal = False # Only do this for encoder-only models
def forward(self, hidden_states, **kwargs):
...
@ -114,13 +101,13 @@ Here is what happens in the background when this model is loaded:
1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
That's it!
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
<details class="code">
<details>
<summary>configuration_my_model.py</summary>
```python
@ -352,7 +339,6 @@ th {
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ |
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
@ -428,7 +414,6 @@ th {
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ |✅︎ | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
@ -472,7 +457,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
!!! note
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
!!! note
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
@ -567,18 +552,7 @@ If your model is not in the above list, we will try to automatically convert the
!!! important
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
#### Token Classification
These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ |
!!! note
Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>.
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
[](){ #supported-mm-models }

View File

@ -193,7 +193,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`. Noted, you may also specify one or multiple NIXL_Backend. Such as: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'`
2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`
3. **Client Orchestration**: Use the client-side script below to coordinate prefill/decode operations. We are actively working on routing solutions.

View File

@ -133,7 +133,7 @@ completion = client.chat.completions.create(
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
],
extra_body={
"structured_outputs": {"choice": ["positive", "negative"]}
"guided_choice": ["positive", "negative"]
}
)
```
@ -317,11 +317,10 @@ Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_cli
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
The following [pooling parameters][pooling-params] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:embedding-pooling-params"
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params"
```
The following extra parameters are supported by default:
@ -375,7 +374,7 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
```
[](){ #translations-api }
### Translations API
@ -528,11 +527,10 @@ curl -v "http://127.0.0.1:8000/classify" \
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
The following [pooling parameters][pooling-params] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params"
```
The following extra parameters are supported:
@ -735,11 +733,10 @@ Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_mu
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
The following [pooling parameters][pooling-params] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params"
```
The following extra parameters are supported:
@ -818,11 +815,10 @@ Result documents will be sorted by relevance, and the `index` property can be us
#### Extra parameters
The following [pooling parameters][vllm.PoolingParams] are supported.
The following [pooling parameters][pooling-params] are supported.
```python
--8<-- "vllm/pooling_params.py:common-pooling-params"
--8<-- "vllm/pooling_params.py:classification-pooling-params"
--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params"
```
The following extra parameters are supported:

View File

@ -1,6 +1,6 @@
# Using vLLM
First, vLLM must be [installed](../getting_started/installation/) for your chosen device in either a Python or Docker environment.
First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
Then, vLLM supports the following usage patterns:

View File

@ -87,7 +87,6 @@ def main(args: dict):
use_tqdm=False,
chat_template=chat_template,
)
print_outputs(outputs)
if __name__ == "__main__":

View File

@ -101,13 +101,6 @@ def parse_args():
"--quantization",
type=str,
)
parser.add_argument(
"--disable-expert-parallel",
dest="enable_expert_parallel",
action="store_false",
help="Disable expert parallel (default: enabled).",
)
parser.set_defaults(enable_expert_parallel=True)
return parser.parse_args()
@ -120,7 +113,6 @@ def main(
dp_master_port,
GPUs_per_dp_rank,
enforce_eager,
enable_expert_parallel,
trust_remote_code,
max_num_seqs,
max_model_len,
@ -176,7 +168,7 @@ def main(
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=enable_expert_parallel,
enable_expert_parallel=True,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
@ -237,7 +229,6 @@ if __name__ == "__main__":
dp_master_port,
tp_size,
args.enforce_eager,
args.enable_expert_parallel,
args.trust_remote_code,
args.max_num_seqs,
args.max_model_len,

View File

@ -26,14 +26,8 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
python examples/offline_inference/pooling/embed_matryoshka_fy.py
```
## Named Entity Recognition (NER) usage
```bash
python examples/offline_inference/pooling/ner.py
```
## Qwen3 reranker usage
```bash
python examples/offline_inference/pooling/qwen3_reranker.py
python qwen3_reranker.py
```

View File

@ -1,54 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
from argparse import Namespace
from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser
def parse_args():
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(
model="boltuix/NeuroBERT-NER",
runner="pooling",
enforce_eager=True,
trust_remote_code=True,
)
return parser.parse_args()
def main(args: Namespace):
# Sample prompts.
prompts = [
"Barack Obama visited Microsoft headquarters in Seattle on January 2025."
]
# Create an LLM.
llm = LLM(**vars(args))
tokenizer = llm.get_tokenizer()
label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
# Run inference
outputs = llm.encode(prompts)
for prompt, output in zip(prompts, outputs):
logits = output.outputs.data
predictions = logits.argmax(dim=-1)
# Map predictions to labels
tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids)
labels = [label_map[p.item()] for p in predictions]
# Print results
for token, label in zip(tokens, labels):
if token not in tokenizer.all_special_tokens:
print(f"{token:15}{label}")
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -0,0 +1,510 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import json
import os
import sys
from argparse import RawTextHelpFormatter
from collections.abc import Generator
from dataclasses import asdict, dataclass
from typing import Any, Optional, TypeAlias
import torch
import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1
PROMPT_LEN_DEFAULT = 256
@dataclass
class ProfileContext:
engine_args: EngineArgs
prompt_len: int
batch_size: int
# The profiler can run in 2 modes,
# 1. Run profiler for user specified num_steps
num_steps: Optional[int] = None
# 2. Run profiler until all requests complete
complete_num_requests_per_step: Optional[int] = None
save_chrome_traces_folder: Optional[str] = None
def get_dtype(dtype: str):
if dtype == "torch.float":
return torch.float
else:
return dtype
OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
def compute_request_output_lengths(
batch_size: int, step_requests: list[int]
) -> OutputLen_NumReqs_Map:
"""
Given the number of requests, batch_size, and the number of requests
that each engine-step should process, step_requests, determine the
output lengths of the requests such that step_request is honoured.
Example:
if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
then return,
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
32 requests should have output length 2,
32 requests should have output length 3,
32 requests should have output length 4,
31 requests should have output length 5,
1 request should have output length 6.
Args:
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
step_requests (list[int]): step_requests[i] is the number of requests
that the ith engine step should process.
Returns:
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
number of requests required to have that output-length as values.
"""
ol_nr: OutputLen_NumReqs_Map = {}
# Number of request that are assigned an output-length
num_reqs_assigned: int = 0
num_steps: int = len(step_requests)
# sanity check. The first step (prefill-step), must process all requests.
assert step_requests[0] == batch_size
# Begin assignments from the last step.
output_length: int = num_steps
for num_requests_at_step in reversed(step_requests):
if num_reqs_assigned == batch_size:
break
assert num_reqs_assigned < batch_size
# Remove the number of requests that have been determined
# to participate in this step and beyond.
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
assert num_reqs_unassigned_at_step >= 0
if num_reqs_unassigned_at_step > 0:
ol_nr[output_length] = num_reqs_unassigned_at_step
num_reqs_assigned += num_reqs_unassigned_at_step
output_length -= 1
# sanity checks.
assert sum(ol_nr.values()) == batch_size, (
"Number of requests in output-length assignment does not match "
f"batch-size.\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
# Check that the output-length is in [1, num-steps]. Output length must be
# at least 1 as all requests must participate in the prefill-step.
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
"Output lengths of requests should be in range "
f"[1, num-engine-steps].\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
return ol_nr
def determine_requests_per_step(context: ProfileContext) -> list[int]:
"""
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
same number of requests and the output list is of length
context.num_steps.
If context.complete_num_requests_per_step is set, then each decode step
processes fewer and fewer requests until there are no requests to process.
In this case, the output list is as big as the number of steps
required to process all requests.
Args:
context: ProfileContext object.
Returns:
list[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
should process.
"""
if context.num_steps:
# All requests must run until num_engine_steps. This implies
# that their output lengths must be equal to num_engine_steps.
return [context.batch_size] * context.num_steps
assert (
context.complete_num_requests_per_step
and context.complete_num_requests_per_step > 0
), (
f"Expected a positive complete_num_requests_per_step argument."
f"Instead got {context.complete_num_requests_per_step}"
)
# We start dropping after the first decode step.
step_requests = [
context.batch_size, # prefill
context.batch_size, # decode
]
num_running_requests = context.batch_size
num_running_requests -= context.complete_num_requests_per_step
while num_running_requests > 0:
step_requests.append(num_running_requests)
num_running_requests -= context.complete_num_requests_per_step
if step_requests[-1] != 1:
# have 1 request running at the last step. This is often
# useful
step_requests.append(1)
return step_requests
def run_profile(
context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
):
print("Run profile with:")
for key, value in asdict(context).items():
print(f" {key} = {value}")
requests_per_step: list[int] = determine_requests_per_step(context)
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
context.batch_size, requests_per_step
)
num_steps_to_profile: int = len(requests_per_step)
max_output_len: int = max(ol_nr.keys())
assert max_output_len >= 1
# Create sampling params
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
# max_tokens is set on a per-request basis.
max_tokens=None,
ignore_eos=True,
)
# Create LLM
llm = LLM(**asdict(context.engine_args))
batch_size = context.batch_size
prompt_len = context.prompt_len
scheduler_config = llm.llm_engine.vllm_config.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs
if batch_size * prompt_len > max_num_batched_tokens:
print(
f"ERROR: chosen batch_size * prompt_len "
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
f"and therefore cannot be run in a single profile step, please "
f"choose a smaller batch size or prompt length, or increase "
f"--max-num-batched-tokens"
)
sys.exit(-1)
if batch_size > max_num_seqs:
print(
f"ERROR: chosen batch_size ({batch_size}) is larger than "
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
f"single profile step, please choose a smaller batch size"
)
sys.exit(-1)
print(
"llm.llm_engine.model_config.max_model_len: ",
llm.llm_engine.model_config.max_model_len,
)
if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
print(
f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
f"{max_output_len} = {prompt_len + max_output_len}) is larger "
f"than the model's max_model_len ({max_model_len}), please "
f"choose a smaller prompt_len or max_output_len, or increase "
f"--max-model-len"
)
sys.exit(-1)
def add_requests():
def get_output_len_generator() -> Generator[int, Any, Any]:
for output_len, num_reqs in ol_nr.items():
for _ in range(num_reqs):
yield output_len
output_len_generator = get_output_len_generator()
for i in range(batch_size):
sampling_params.max_tokens = next(output_len_generator)
assert isinstance(sampling_params.max_tokens, int)
prompt_token_ids = torch.randint(
llm.get_tokenizer().vocab_size, size=(prompt_len,)
).tolist()
llm.llm_engine.add_request(
request_id=f"seq{i}",
prompt={"prompt_token_ids": prompt_token_ids},
params=sampling_params,
)
def abort_requests():
for i in range(batch_size):
llm.llm_engine.abort_request(f"seq{i}")
# Warm up run
print("Warm up run ...")
add_requests()
llm.llm_engine.step() # Prefill
llm.llm_engine.step() # Decode
abort_requests()
print("Profile run ...")
add_requests()
with layerwise_profile() as prefill_prof:
llm.llm_engine.step() # First step is prefill
decode_profs = []
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
llm.llm_engine.step()
decode_profs.append(decode_prof)
decode_results_list = [prof.results for prof in decode_profs]
prefill_results = prefill_prof.results
has_decode = len(decode_results_list) > 0
LINE_WIDTH = 80
print("=" * LINE_WIDTH)
print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_model_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_model_table()
print()
print("=" * LINE_WIDTH)
print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_summary_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_summary_table()
if csv_output:
csv_filename_base = (
csv_output[:-4] if csv_output.endswith(".csv") else csv_output
)
prefill_results.export_model_stats_table_csv(
csv_filename_base + "_prefill_model_table.csv"
)
prefill_results.export_summary_stats_table_csv(
csv_filename_base + "_prefill_summary_table.csv"
)
if has_decode:
decode_results_list[0].export_model_stats_table_csv(
csv_filename_base + "_decode_model_table.csv"
)
decode_results_list[0].export_summary_stats_table_csv(
csv_filename_base + "_decode_summary_table.csv"
)
if json_output:
cuda_devices = [
torch.cuda.get_device_properties(dev_idx)
for dev_idx in range(torch.cuda.device_count())
]
json_dict = {
"context": {
"python_version": f"{sys.version}",
"torch_version": f"{torch.__version__}",
"torch_cuda_version": f"{torch.version.cuda}",
"cuda_devices": f"{cuda_devices}",
**asdict(context),
},
"prefill": prefill_results.convert_stats_to_dict(),
}
if has_decode:
for idx, dr in enumerate(decode_results_list):
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
# Add .json to json_output filename if it doesn't exist already.
json_output_file = (
json_output if json_output.endswith(".json") else json_output + ".json"
)
with open(json_output_file, "w+") as f:
json.dump(json_dict, f, indent=2)
pass
if context.save_chrome_traces_folder is not None:
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
prefill_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + "/prefill.json"
)
for idx, decode_prof in enumerate(decode_profs):
decode_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
)
print(
"Traces saved as prefill.json and decode_1.json, etc."
f" in folder {context.save_chrome_traces_folder}"
)
def parse_args():
parser = FlexibleArgumentParser(
description="""
Profile a model
example:
```
python examples/offline_inference/profiling.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2
```
then you can use various tools to analyze the json output
terminal ascii tables:
```
python tools/profiler/print_layerwise_table.py \\
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
```
or create matplotlib stacked bar charts:
```
python tools/profiler/visualize_layerwise_profile.py \\
--json-trace Llama31-8b-FP8.json \\
--output-directory profile_breakdown --plot-metric pct_cuda_time
```
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--csv",
type=str,
default=None,
help="Export the results as multiple csv file. This should be the root "
"filename, will create <filename>_prefill_model_table.csv, "
"<filename>_prefill_summary_table.csv, "
"<filename>_decode_model_table.csv, and "
"<filename>_decode_summary_table.csv",
)
parser.add_argument(
"--json",
type=str,
default=None,
help="Export the results as a json file. This should be the filename",
)
parser.add_argument(
"--save-chrome-traces-folder",
type=str,
help="Save chrome traces for the prefill and decode "
"will save traces as prefill.json and decode_1.json, "
"etc. inside this folder",
)
parser.add_argument(
"--prompt-len",
type=int,
default=PROMPT_LEN_DEFAULT,
help=f"Length of the random prompt to use when profiling, all batched "
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
)
parser.add_argument(
"--batch-size",
type=int,
default=BATCH_SIZE_DEFAULT,
help=f"Number of requests to run as a single batch, "
f"default={BATCH_SIZE_DEFAULT}",
)
subparsers = parser.add_subparsers(dest="cmd")
run_num_steps_parser = subparsers.add_parser(
"run_num_steps", help="This variation profiles n engine.step() invocations."
)
run_num_steps_parser.add_argument(
"-n",
"--num-steps",
type=int,
help="Number of engine steps to profile.\n"
"Setting it to 1, profiles only the prefill step.\n"
"Setting it to 2, profiles the prefill and first decode step\n"
"Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
"and so on ...",
)
run_to_completion_parser = subparsers.add_parser(
"run_to_completion",
help="This variation profiles all the engine.step() invocations"
"until the engine exhausts all submitted requests.",
)
run_to_completion_parser.add_argument(
"-n",
"--complete-num-requests-per-step",
type=int,
help="Complete complete_num_requests_per_step requests every decode step."
"For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
"the profiler is run for 6 engine steps, with the steps processing, "
"128, 128, 96, 64, 32, 1 requests respectively.\n"
"Note that we tack-on a one-request step at the end as it is often "
"useful.",
)
EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args):
context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args),
**{
k: v
for k, v in vars(args).items()
if k in inspect.signature(ProfileContext).parameters
},
)
run_profile(context, csv_output=args.csv, json_output=args.json)
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -5,6 +5,7 @@ from urllib.request import urlopen
from vllm import LLM, SamplingParams
os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

View File

@ -49,11 +49,11 @@ def get_custom_mm_prompts(num_prompts):
def parse_args():
parser = FlexibleArgumentParser()
add_dataset_parser(parser)
parser.add_argument("--test", action="store_true")
parser.add_argument(
"--method",
type=str,
default="eagle",
choices=["ngram", "eagle", "eagle3", "mtp"],
)
parser.add_argument("--num-spec-tokens", type=int, default=2)
parser.add_argument("--prompt-lookup-max", type=int, default=5)
@ -61,7 +61,6 @@ def parse_args():
parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--enforce-eager", action="store_true")
parser.add_argument("--enable-chunked-prefill", action="store_true")
parser.add_argument("--max-model-len", type=int, default=16384)
parser.add_argument("--temp", type=float, default=0)
parser.add_argument("--top-p", type=float, default=1.0)
parser.add_argument("--top-k", type=int, default=-1)
@ -73,7 +72,8 @@ def parse_args():
return parser.parse_args()
def main(args):
def main():
args = parse_args()
args.endpoint_type = "openai-chat"
model_dir = args.model_dir
@ -118,11 +118,6 @@ def main(args):
"prompt_lookup_max": args.prompt_lookup_max,
"prompt_lookup_min": args.prompt_lookup_min,
}
elif args.method.endswith("mtp"):
speculative_config = {
"method": args.method,
"num_speculative_tokens": args.num_spec_tokens,
}
else:
raise ValueError(f"unknown method: {args.method}")
@ -135,7 +130,7 @@ def main(args):
gpu_memory_utilization=0.8,
speculative_config=speculative_config,
disable_log_stats=False,
max_model_len=args.max_model_len,
max_model_len=16384,
limit_mm_per_prompt={"image": 5},
disable_chunked_mm_input=True,
)
@ -199,39 +194,6 @@ def main(args):
acceptance_rate = acceptance_counts[i] / num_drafts if num_drafts > 0 else 0
print(f"acceptance at token {i}: {acceptance_rate:.2f}")
return acceptance_length
if __name__ == "__main__":
args = parse_args()
acceptance_length = main(args)
if args.test:
# takes ~30s to run on 1xH100
assert args.method in ["eagle", "eagle3"]
assert args.tp == 1
assert args.num_spec_tokens == 3
assert args.dataset_name == "hf"
assert args.dataset_path == "philschmid/mt-bench"
assert args.num_prompts == 80
assert args.temp == 0
assert args.top_p == 1.0
assert args.top_k == -1
assert args.enable_chunked_prefill
# check acceptance length is within 2% of expected value
rtol = 0.02
expected_acceptance_length = 2.296 if args.method == "eagle" else 2.811
assert (
acceptance_length <= (1 + rtol) * expected_acceptance_length
and acceptance_length >= (1 - rtol) * expected_acceptance_length
), (
f"acceptance_length {acceptance_length} is not "
f"within {rtol * 100}% of {expected_acceptance_length}"
)
print(
f"Test passed! Expected AL: "
f"{expected_acceptance_length}, got {acceptance_length}"
)
main()

View File

@ -1,10 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file demonstrates the example usage of structured outputs
in vLLM. It shows how to apply different constraints such as choice,
regex, json schema, and grammar to produce structured and formatted
results based on specific prompts.
This file demonstrates the example usage of guided decoding
to generate structured outputs using vLLM. It shows how to apply
different guided decoding techniques such as Choice, Regex, JSON schema,
and Grammar to produce structured and formatted results
based on specific prompts.
"""
from enum import Enum
@ -12,23 +13,19 @@ from enum import Enum
from pydantic import BaseModel
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams
from vllm.sampling_params import GuidedDecodingParams
MAX_TOKENS = 50
# Structured outputs by Choice (list of possible options)
structured_outputs_params_choice = StructuredOutputsParams(
choice=["Positive", "Negative"]
)
sampling_params_choice = SamplingParams(
structured_outputs=structured_outputs_params_choice
)
# Guided decoding by Choice (list of possible options)
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
# Structured outputs by Regex
structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
# Guided decoding by Regex
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
sampling_params_regex = SamplingParams(
structured_outputs=structured_outputs_params_regex,
guided_decoding=guided_decoding_params_regex,
stop=["\n"],
max_tokens=MAX_TOKENS,
)
@ -39,7 +36,7 @@ prompt_regex = (
)
# Structured outputs by JSON using Pydantic schema
# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
@ -54,16 +51,17 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema()
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
sampling_params_json = SamplingParams(
structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
guided_decoding=guided_decoding_params_json,
max_tokens=MAX_TOKENS,
)
prompt_json = (
"Generate a JSON with the brand, model and car_type of "
"Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's"
)
# Structured outputs by Grammar
# Guided decoding by Grammar
simplified_sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
@ -72,15 +70,13 @@ table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
structured_outputs_params_grammar = StructuredOutputsParams(
grammar=simplified_sql_grammar
)
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
sampling_params_grammar = SamplingParams(
structured_outputs=structured_outputs_params_grammar,
guided_decoding=guided_decoding_params_grammar,
max_tokens=MAX_TOKENS,
)
prompt_grammar = (
"Generate an SQL query to show the 'username' and 'email' from the 'users' table."
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
)
@ -97,16 +93,16 @@ def main():
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
format_output("Structured outputs by Choice", choice_output)
format_output("Guided decoding by Choice", choice_output)
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
format_output("Structured outputs by Regex", regex_output)
format_output("Guided decoding by Regex", regex_output)
json_output = generate_output(prompt_json, sampling_params_json, llm)
format_output("Structured outputs by JSON", json_output)
format_output("Guided decoding by JSON", json_output)
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
format_output("Structured outputs by Grammar", grammar_output)
format_output("Guided decoding by Grammar", grammar_output)
if __name__ == "__main__":

View File

@ -1,81 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
experimental support for data-parallel inference with torchrun
Note the data load balancing and distribution is done out of the vllm engine,
no internal lb supported in external_launcher mode.
"""
from vllm import LLM, SamplingParams
# Create prompts, the same across all ranks
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 50
# Create sampling parameters, the same across all ranks
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Use `distributed_executor_backend="external_launcher"` so that
# this llm engine/instance only creates one worker.
# it is important to set an explicit seed to make sure that
# all ranks have the same random seed, so that sampling can be
# deterministic across ranks.
llm = LLM(
model="microsoft/Phi-mini-MoE-instruct",
tensor_parallel_size=1,
data_parallel_size=2,
pipeline_parallel_size=1,
enable_expert_parallel=False,
distributed_executor_backend="external_launcher",
max_model_len=4096,
gpu_memory_utilization=0.6,
seed=1,
)
dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size
prompts = [
f"{idx}.{prompt}" for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank
]
outputs = llm.generate(prompts, sampling_params)
# all ranks will have the same outputs
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
print("-" * 50)
"""
Further tips:
1. to communicate control messages across all ranks, use the cpu group,
a PyTorch ProcessGroup with GLOO backend.
```python
from vllm.distributed.parallel_state import get_world_group
cpu_group = get_world_group().cpu_group
torch_rank = dist.get_rank(group=cpu_group)
if torch_rank == 0:
# do something for rank 0, e.g. saving the results to disk.
```
2. to communicate data across all ranks, use the model's device group,
a PyTorch ProcessGroup with NCCL backend.
```python
from vllm.distributed.parallel_state import get_world_group
device_group = get_world_group().device_group
```
3. to access the model directly in every rank, use the following code:
```python
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
```
"""

View File

@ -126,23 +126,6 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
)
# Dots-OCR
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
engine_args = EngineArgs(
model="rednote-hilab/dots.ocr",
limit_mm_per_prompt={modality: 1},
trust_remote_code=True,
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@ -1693,7 +1676,6 @@ model_example_map = {
"aya_vision": run_aya_vision,
"blip-2": run_blip2,
"chameleon": run_chameleon,
"dots_ocr": run_dots_ocr,
"command_a_vision": run_command_a_vision,
"deepseek_vl_v2": run_deepseek_vl2,
"ernie45_vl": run_ernie45_vl,

View File

@ -11,9 +11,9 @@ vLLM performance and metrics.
## Dashboard Descriptions
- **performance_statistics.json**: Tracks performance metrics including latency and
- **[performance_statistics.json](./performance_statistics.json)**: Tracks performance metrics including latency and
throughput for your vLLM service.
- **query_statistics.json**: Tracks query performance, request volume, and key
- **[query_statistics.json](./query_statistics.json)**: Tracks query performance, request volume, and key
performance indicators for your vLLM service.
## Deployment Options

View File

@ -21,9 +21,9 @@ deployment methods:
## Dashboard Descriptions
- **performance_statistics.yaml**: Performance metrics with aggregated latency
- **[performance_statistics.yaml](./performance_statistics.yaml)**: Performance metrics with aggregated latency
statistics
- **query_statistics.yaml**: Query performance and deployment metrics
- **[query_statistics.yaml](./query_statistics.yaml)**: Query performance and deployment metrics
## Deployment Options

View File

@ -6,7 +6,7 @@ without any specific flags:
```bash
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--structured-outputs-config.backend outlines
--guided-decoding-backend outlines
```
This example demonstrates how to generate chat completions

View File

@ -42,7 +42,7 @@ python client.py
### Server Configuration
The key parameters for chunked processing are in the `--pooler-config`:
The key parameters for chunked processing are in the `--override-pooler-config`:
```json
{

View File

@ -13,7 +13,7 @@ Prerequisites:
# MEAN pooling (processes all chunks, recommended for complete coverage)
vllm serve intfloat/multilingual-e5-large \
--pooler-config \
--override-pooler-config \
'{"pooling_type": "MEAN", "normalize": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \
--served-model-name multilingual-e5-large \
@ -23,7 +23,7 @@ Prerequisites:
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
vllm serve BAAI/bge-large-en-v1.5 \
--pooler-config \
--override-pooler-config \
'{"pooling_type": "CLS", "normalize": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \
--served-model-name bge-large-en-v1.5 \

View File

@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab
vllm serve "$MODEL_NAME" \
--tensor-parallel-size "$GPU_COUNT" \
--enforce-eager \
--pooler-config "$POOLER_CONFIG" \
--override-pooler-config "$POOLER_CONFIG" \
--served-model-name ${MODEL_CODE} \
--api-key "$API_KEY" \
--trust-remote-code \

View File

@ -12,12 +12,6 @@ python examples/online_serving/pooling/cohere_rerank_client.py
python examples/online_serving/pooling/jinaai_rerank_client.py
```
## Named Entity Recognition (NER) usage
```bash
python examples/online_serving/pooling/ner.py
```
## Openai chat embedding for multimodal usage
```bash

View File

@ -1,71 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
"""
Example online usage of Pooling API for Named Entity Recognition (NER).
Run `vllm serve <model> --runner pooling`
to start up the server in vLLM. e.g.
vllm serve boltuix/NeuroBERT-NER
"""
import argparse
import requests
import torch
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
headers = {"User-Agent": "Test Client"}
response = requests.post(api_url, headers=headers, json=prompt)
return response
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER")
return parser.parse_args()
def main(args):
from transformers import AutoConfig, AutoTokenizer
api_url = f"http://{args.host}:{args.port}/pooling"
model_name = args.model
# Load tokenizer and config
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
label_map = config.id2label
# Input text
text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
prompt = {"model": model_name, "input": text}
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
# Run inference
output = pooling_response.json()["data"][0]
logits = torch.tensor(output["data"])
predictions = logits.argmax(dim=-1)
inputs = tokenizer(text, return_tensors="pt")
# Map predictions to labels
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
labels = [label_map[p.item()] for p in predictions]
assert len(tokens) == len(predictions)
# Print results
for token, label in zip(tokens, labels):
if token not in tokenizer.all_special_tokens:
print(f"{token:15}{label}")
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"content": "Classify this sentiment: vLLM is wonderful!",
}
],
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
"extra_body": {"guided_choice": ["positive", "negative"]},
},
"regex": {
"messages": [
@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
}
],
"extra_body": {
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
},
},
"json": {
@ -122,8 +122,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
}
],
"extra_body": {
"structured_outputs": {
"grammar": """
"guided_grammar": """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
@ -136,7 +135,6 @@ condition ::= column "= " number
number ::= "1 " | "2 "
""",
}
},
},
"structural_tag": {

View File

@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import dataclasses
import json
import logging
import os
@ -325,7 +327,12 @@ def main():
if args.command == "serialize":
engine_args = EngineArgs.from_cli_args(args)
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}
engine_args = EngineArgs.from_cli_args(
argparse.Namespace(**eng_args_dict)
)
input_dir = tensorizer_dir.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex

View File

@ -79,7 +79,6 @@ plugins:
- "re:vllm\\._.*" # Internal modules
- "vllm.third_party"
- "vllm.vllm_flash_attn"
- !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default
- mkdocstrings:
handlers:
python:
@ -102,7 +101,6 @@ plugins:
- https://numpy.org/doc/stable/objects.inv
- https://pytorch.org/docs/stable/objects.inv
- https://psutil.readthedocs.io/en/stable/objects.inv
- https://huggingface.co/docs/transformers/main/en/objects.inv
markdown_extensions:
- attr_list

View File

@ -70,6 +70,7 @@ line-length = 80
"vllm/_version.py" = ["ALL"]
# Python 3.8 typing - skip V0 code
"vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
@ -110,6 +111,28 @@ ignore_missing_imports = true
check_untyped_defs = true
follow_imports = "silent"
# After fixing type errors resulting from follow_imports: "skip" -> "silent",
# move the directory here and remove it from tools/mypy.sh
files = [
"vllm/*.py",
"vllm/assets",
"vllm/entrypoints",
"vllm/core",
"vllm/inputs",
"vllm/logging_utils",
"vllm/multimodal",
"vllm/platforms",
"vllm/transformers_utils",
"vllm/triton_utils",
"vllm/usage",
]
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = [
"vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
# Ignore triton kernels in ops.
'vllm/attention/ops/.*\.py$'
]
[tool.isort]
skip_glob = [
".buildkite/*",

View File

@ -24,7 +24,7 @@ outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs

View File

@ -1,11 +1,12 @@
# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
# see https://github.com/pytorch/pytorch/pull/151218
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.6.0+cpu
wheel
jinja2>=3.1.6
regex

View File

@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64"
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.8.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
@ -23,7 +23,7 @@ datasets # for benchmark scripts
# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.8.0; platform_machine == "x86_64"
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
# Use this to gather CPU info and optimize based on ARM Neoverse cores

View File

@ -1,5 +1,5 @@
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128
absl-py==2.1.0
# via rouge-score
accelerate==1.0.1

View File

@ -14,4 +14,14 @@ nixl==0.3.0
tpu_info==0.4.0
# Install torch_xla
torch_xla[tpu, pallas]==2.8.0
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cpu
--find-links https://storage.googleapis.com/libtpu-wheels/index.html
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.9.0.dev20250730
torchvision==0.24.0.dev20250730
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250730-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"

View File

@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vllm.entrypoints.api_server with some extra logging for testing."""
from collections.abc import Iterable
from typing import Any
import uvicorn
from fastapi.responses import JSONResponse, Response
import vllm.entrypoints.api_server
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.utils import FlexibleArgumentParser
app = vllm.entrypoints.api_server.app
class AsyncLLMEngineWithStats(AsyncLLMEngine):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._num_aborts = 0
async def _engine_abort(self, request_ids: Iterable[str]):
ids = list(request_ids)
self._num_aborts += len(ids)
await super()._engine_abort(ids)
def testing_stats(self) -> dict[str, Any]:
return {"num_aborted_requests": self._num_aborts}
@app.get("/stats")
def stats() -> Response:
"""Get the statistics of the engine."""
return JSONResponse(engine.testing_stats())
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
vllm.entrypoints.api_server.engine = engine
uvicorn.run(app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)

View File

@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

View File

@ -0,0 +1,139 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copyreg
import os
import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path
import pytest
import requests
import urllib3.exceptions
def _pickle_new_connection_error(obj):
"""Custom pickler for NewConnectionError to fix tblib compatibility."""
# Extract the original message by removing the "conn: " prefix
full_message = obj.args[0] if obj.args else ""
if ': ' in full_message:
# Split off the connection part and keep the actual message
_, actual_message = full_message.split(': ', 1)
else:
actual_message = full_message
return _unpickle_new_connection_error, (actual_message, )
def _unpickle_new_connection_error(message):
"""Custom unpickler for NewConnectionError."""
# Create with None as conn and the actual message
return urllib3.exceptions.NewConnectionError(None, message)
# Register the custom pickle/unpickle functions for tblib compatibility
copyreg.pickle(urllib3.exceptions.NewConnectionError,
_pickle_new_connection_error)
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
response = requests.post("http://localhost:8000/generate",
json={
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": 0,
"ignore_eos": True
})
response.raise_for_status()
return response.json()
def _query_server_long(prompt: str) -> dict:
return _query_server(prompt, max_tokens=500)
@pytest.fixture
def api_server(distributed_executor_backend: str):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
commands = [
sys.executable,
"-u",
str(script_path),
"--model",
"facebook/opt-125m",
"--host",
"127.0.0.1",
"--distributed-executor-backend",
distributed_executor_backend,
]
# API Server Test Requires V0.
my_env = os.environ.copy()
my_env["VLLM_USE_V1"] = "0"
uvicorn_process = subprocess.Popen(commands, env=my_env)
yield
uvicorn_process.terminate()
@pytest.mark.timeout(300)
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, distributed_executor_backend: str):
"""
Run the API server and test it.
We run both the server and requests in separate processes.
We test that the server can handle incoming requests, including
multiple requests at the same time, and that it can handle requests
being cancelled without crashing.
"""
with Pool(32) as pool:
# Wait until the server is ready
prompts = ["warm up"] * 1
result = None
while not result:
try:
for r in pool.map(_query_server, prompts):
result = r
break
except requests.exceptions.ConnectionError:
time.sleep(1)
# Actual tests start here
# Try with 1 prompt
for result in pool.map(_query_server, prompts):
assert result
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests == 0
# Try with 100 prompts
prompts = ["test prompt"] * 100
for result in pool.map(_query_server, prompts):
assert result
with Pool(32) as pool:
# Cancel requests
prompts = ["canceled requests"] * 100
pool.map_async(_query_server_long, prompts)
time.sleep(0.01)
pool.terminate()
pool.join()
# check cancellation stats
# give it some time to update the stats
time.sleep(1)
num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
# check that server still runs after cancellations
with Pool(32) as pool:
# Try with 100 prompts
prompts = ["test prompt after canceled"] * 100
for result in pool.map(_query_server, prompts):
assert result

View File

@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.engine.async_llm_engine import RequestTracker
from vllm.outputs import RequestOutput
@pytest.mark.asyncio
async def test_request_tracker():
tracker = RequestTracker()
stream_1 = tracker.add_request("1")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 1
assert new[0]["request_id"] == "1"
assert not aborted
assert not stream_1.finished
stream_2 = tracker.add_request("2")
stream_3 = tracker.add_request("3")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert len(new) == 2
assert new[0]["request_id"] == "2"
assert new[1]["request_id"] == "3"
assert not aborted
assert not stream_2.finished
assert not stream_3.finished
# request_ids must be unique
with pytest.raises(KeyError):
tracker.add_request("1")
assert not tracker.new_requests_event.is_set()
tracker.abort_request("1")
new, aborted = tracker.get_new_and_aborted_requests()
assert len(aborted) == 1
assert "1" in aborted
assert not new
assert stream_1.finished
stream_4 = tracker.add_request("4")
tracker.abort_request("4")
assert tracker.new_requests_event.is_set()
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
# aborted new requests will cancel each other out -
# there's no need for them to propagate into the
# engine
assert not aborted
assert not new
assert stream_4.finished
stream_5 = tracker.add_request("5")
assert tracker.new_requests_event.is_set()
tracker.process_request_output(
RequestOutput("2", "output", [], [], [], finished=True))
await tracker.wait_for_new_requests()
new, aborted = tracker.get_new_and_aborted_requests()
assert not tracker.new_requests_event.is_set()
assert not aborted
assert len(new) == 1
assert new[0]["request_id"] == "5"
assert stream_2.finished
assert not stream_5.finished

View File

@ -11,7 +11,7 @@ from unittest.mock import Mock
import pytest
import torch
from vllm import LLM
from vllm import LLM, envs
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from ..conftest import HfRunner, VllmRunner
@ -26,6 +26,14 @@ MODELS = [
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("distilbert/distilgpt2")
@ -68,6 +76,17 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
if enable_prompt_embeds and envs.is_set(
"VLLM_USE_V1") and envs.VLLM_USE_V1:
pytest.skip("enable_prompt_embeds is not supported in v1.")
if not envs.VLLM_USE_V1:
if async_scheduling:
pytest.skip("async_scheduling only supported in v1.")
if model_executor != "uni":
pytest.skip("only test uniproc executor for v0.")
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
pytest.skip(
f"{backend} does not support gemma2 with full context length.")
@ -145,6 +164,11 @@ def test_models_distributed(
extra_env: dict[str, str],
enable_prompt_embeds: bool,
) -> None:
if enable_prompt_embeds and envs.is_set(
"VLLM_USE_V1") and envs.VLLM_USE_V1:
pytest.skip("enable_prompt_embeds is not supported in v1.")
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

View File

@ -122,12 +122,11 @@ def test_cumem_with_cudagraph():
# sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True),
# sleep mode with pytorch checkpoint
("facebook/opt-125m", True),
("facebook/opt-125m", False),
])
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
assert use_v1
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)

Some files were not shown because too many files have changed in this diff Show More