Compare commits
80 Commits
v0.11.0rc4
...
fix_ds_eag
| Author | SHA1 | Date | |
|---|---|---|---|
| 9ffb182ba3 | |||
| 80608ba5af | |||
| e184c9c510 | |||
| d7e34b4210 | |||
| ef6e0e7132 | |||
| 1ad3aca682 | |||
| 8d0afa9b42 | |||
| fa7e254a7f | |||
| e23cacda35 | |||
| 2e1b8bc2b6 | |||
| e47433b3c1 | |||
| 23194d83e8 | |||
| 61aedb5ffe | |||
| d3bd171123 | |||
| 89e4050af4 | |||
| 78a47f87ce | |||
| 6a113d9aed | |||
| 2e4fe48c37 | |||
| 8eb0a1d906 | |||
| fea3e476aa | |||
| 61a3431613 | |||
| 9bedac9623 | |||
| c42ff4f4fd | |||
| d5ab28511c | |||
| e61eb5e09d | |||
| 0899ba5b42 | |||
| 145ac73317 | |||
| d0d138bc55 | |||
| 43227236ec | |||
| 8616300ae2 | |||
| edbaadd91f | |||
| 9360d34fa1 | |||
| 1b67b04656 | |||
| bd51f78e39 | |||
| 65ecb4f134 | |||
| 143844fa43 | |||
| 219cfbe7f6 | |||
| 9b44a7d926 | |||
| a3ae45a38c | |||
| 0307428d65 | |||
| 471997adf6 | |||
| b1ded114b9 | |||
| f4e4088c99 | |||
| 0efd540dbc | |||
| 6144754014 | |||
| 69311446ba | |||
| da63274d9f | |||
| c216119d64 | |||
| 5546acb463 | |||
| c0ec81836f | |||
| b65e56babe | |||
| 49996cd597 | |||
| ecb37e276a | |||
| a5354b3ed2 | |||
| f9df8b4ad7 | |||
| ec152c8748 | |||
| 7977e5027c | |||
| 3f5d902d2a | |||
| 27d7638b94 | |||
| 176173989a | |||
| 23b8ee672d | |||
| 3939152069 | |||
| cd87bfbf37 | |||
| b3613e3ace | |||
| d346ec695e | |||
| c242c98031 | |||
| f1d53d150c | |||
| 92da847cf5 | |||
| 3958b96bf5 | |||
| 8bf8f45822 | |||
| 6f5c0931c1 | |||
| 4e33a7ea85 | |||
| dc48ba0c75 | |||
| 4778b42660 | |||
| c70ac4b8ff | |||
| cf89202855 | |||
| f075693da7 | |||
| f708bd4904 | |||
| 0002b7f0d1 | |||
| 11aafd9886 |
@ -42,9 +42,8 @@ docker run \
|
||||
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
|
||||
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
|
||||
pytest -v -s v1/structured_output
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||
pytest -v -s v1/test_metrics
|
||||
pytest -v -s v1/test_serial_utils.py
|
||||
pytest -v -s v1/test_utils.py
|
||||
pytest -v -s v1/test_metrics_reader.py
|
||||
'
|
||||
|
||||
@ -159,10 +159,7 @@ steps:
|
||||
- examples/offline_inference/rlhf.py
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
- tests/v1/test_internal_lb_dp.py
|
||||
- tests/v1/test_hybrid_lb_dp.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
- tests/distributed/test_symm_mem_allreduce.py
|
||||
commands:
|
||||
@ -180,10 +177,10 @@ steps:
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
@ -300,12 +297,9 @@ steps:
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s v1/kv_connector/unit
|
||||
- pytest -v -s v1/metrics
|
||||
- pytest -v -s v1/test_kv_sharing.py
|
||||
- pytest -v -s v1/test_metrics_reader.py
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_request.py
|
||||
- pytest -v -s v1/test_serial_utils.py
|
||||
- pytest -v -s v1/test_utils.py
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
@ -465,29 +459,18 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Tensorizer Test # 14min
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
- tests/tensorizer_loader
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
- label: Model Executor Test # 23min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Benchmarks # 11min
|
||||
timeout_in_minutes: 20
|
||||
@ -522,7 +505,7 @@ steps:
|
||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||
# we can only upgrade after this is resolved
|
||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
@ -769,6 +752,7 @@ steps:
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
@ -830,6 +814,23 @@ steps:
|
||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
|
||||
|
||||
- label: Blackwell Quantized MoE Test
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
source_file_dependencies:
|
||||
- tests/quantization/test_blackwell_moe.py
|
||||
- vllm/model_executor/models/deepseek_v2.py
|
||||
- vllm/model_executor/models/gpt_oss.py
|
||||
- vllm/model_executor/models/llama4.py
|
||||
- vllm/model_executor/layers/fused_moe
|
||||
- vllm/model_executor/layers/quantization/compressed_tensors
|
||||
- vllm/model_executor/layers/quantization/modelopt.py
|
||||
- vllm/model_executor/layers/quantization/mxfp4.py
|
||||
- vllm/v1/attention/backends/flashinfer.py
|
||||
commands:
|
||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
@ -889,14 +890,13 @@ steps:
|
||||
- tests/compile/test_wrapper.py
|
||||
- tests/distributed/
|
||||
- tests/entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
- tests/v1/distributed
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- tests/v1/shutdown
|
||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
|
||||
10
.github/CODEOWNERS
vendored
10
.github/CODEOWNERS
vendored
@ -12,8 +12,6 @@
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/sample @22quinn @houseroad
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||
@ -28,11 +26,13 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/attention @LucasWilkinson
|
||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
|
||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
|
||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
|
||||
/vllm/v1/sample @22quinn @houseroad @njhill
|
||||
/vllm/v1/spec_decode @benchislett @luccafong
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/offloading @ApostaC
|
||||
|
||||
@ -54,7 +54,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
/tests/lora @jeejeelee
|
||||
/tests/models/language/generation/test_hybrid.py @tdoublep
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
|
||||
|
||||
2
.github/mergify.yml
vendored
2
.github/mergify.yml
vendored
@ -274,7 +274,7 @@ pull_request_rules:
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer.py
|
||||
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
|
||||
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
- files~=^tests/tensorizer_loader/
|
||||
- files~=^tests/model_executor/model_loader/tensorizer_loader/
|
||||
actions:
|
||||
assign:
|
||||
users:
|
||||
|
||||
@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
|
||||
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
|
||||
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
||||
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
||||
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
||||
|
||||
@ -584,9 +584,8 @@ def main(args: argparse.Namespace):
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
elif config.architectures[0] in (
|
||||
"DeepseekV2ForCausalLM",
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV32ForCausalLM",
|
||||
"DeepseekV2ForCausalLM",
|
||||
"Glm4MoeForCausalLM",
|
||||
):
|
||||
E = config.n_routed_experts
|
||||
|
||||
@ -536,9 +536,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
for (int i = 0; i < VEC_SIZE; i++) {
|
||||
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
|
||||
// Reduced amax
|
||||
for (int mask = 16; mask > 0; mask /= 2) {
|
||||
@ -548,9 +546,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
|
||||
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
|
||||
#endif
|
||||
}
|
||||
#ifndef USE_ROCM
|
||||
__syncwarp();
|
||||
#endif
|
||||
float scale = fmaxf(amax, 1e-4) / 448.0f;
|
||||
if (use_ue8m0) {
|
||||
scale = exp2f(ceilf(log2f(scale)));
|
||||
@ -1171,4 +1167,4 @@ void indexer_k_quant_and_cache(
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
|
||||
CALL_INDEXER_K_QUANT_AND_CACHE);
|
||||
}
|
||||
}
|
||||
16
csrc/core/batch_invariant.hpp
Normal file
16
csrc/core/batch_invariant.hpp
Normal file
@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cctype>
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// vllm_kernel_override_batch_invariant(); returns true
|
||||
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
|
||||
inline bool vllm_kernel_override_batch_invariant() {
|
||||
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
|
||||
const char* val = std::getenv(env_key.c_str());
|
||||
return (val && std::atoi(val) != 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
@ -1,6 +1,7 @@
|
||||
#include "type_convert.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -413,7 +414,9 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||
wt_ptr % req_alignment_bytes == 0;
|
||||
bool offsets_are_multiple_of_vector_width =
|
||||
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
|
||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||
@ -459,7 +462,8 @@ void poly_norm(torch::Tensor& out, // [..., hidden_size]
|
||||
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
|
||||
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0) {
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
|
||||
LAUNCH_FUSED_POLY_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_POLY_NORM(0);
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
#include "quantization/fp8/common.cuh"
|
||||
#include "dispatch_utils.h"
|
||||
#include "cub_helpers.h"
|
||||
#include "core/batch_invariant.hpp"
|
||||
|
||||
#include <torch/cuda.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
@ -240,7 +241,9 @@ void fused_add_rms_norm_static_fp8_quant(
|
||||
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||
bool ptrs_are_aligned =
|
||||
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0) {
|
||||
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
|
||||
!batch_invariant_launch) {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||
} else {
|
||||
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include "../cuda_compat.h"
|
||||
#include "../cub_helpers.h"
|
||||
#include "../core/batch_invariant.hpp"
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
@ -405,7 +406,8 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||
static constexpr int VPT = Constants::VPT;
|
||||
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
|
||||
const int num_warps = batch_invariant_launch ? 32 : (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||
|
||||
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
|
||||
|
||||
@ -391,18 +391,28 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||
git clone --depth 1 --recursive --shallow-submodules \
|
||||
--branch ${FLASHINFER_GIT_REF} \
|
||||
${FLASHINFER_GIT_REPO} flashinfer
|
||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||
else
|
||||
# CUDA 12.8+ supports 10.0a and 12.0
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||
fi
|
||||
pushd flashinfer
|
||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||
else
|
||||
# CUDA 12.8+ supports 10.0a and 12.0
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||
if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
|
||||
# NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
|
||||
echo "🏗️ Installing FlashInfer from pre-compiled wheel"
|
||||
uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||
# Download pre-compiled cubins
|
||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
||||
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
|
||||
fi
|
||||
elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
|
||||
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
||||
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
|
||||
@ -446,7 +456,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||
|
||||
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
||||
RUN set -eux; \
|
||||
@ -464,6 +474,12 @@ ENV CUDA_HOME=/usr/local/cuda
|
||||
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \
|
||||
&& bash install_python_libraries.sh
|
||||
|
||||
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
|
||||
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
|
||||
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
|
||||
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
|
||||
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
|
||||
|
||||
#################### vLLM installation IMAGE ####################
|
||||
|
||||
#################### TEST IMAGE ####################
|
||||
@ -536,7 +552,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
else \
|
||||
BITSANDBYTES_VERSION="0.46.1"; \
|
||||
fi; \
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3]
|
||||
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3]>=0.14.0'
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||
|
||||
- [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. [[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing)
|
||||
- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
|
||||
- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
|
||||
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
|
||||
|
||||
@ -823,6 +823,30 @@ The latest performance results are hosted on the public [vLLM Performance Dashbo
|
||||
|
||||
More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
|
||||
|
||||
### Continuous Benchmarking
|
||||
|
||||
The continuous benchmarking provides automated performance monitoring for vLLM across different models and GPU devices. This helps track vLLM's performance characteristics over time and identify any performance regressions or improvements.
|
||||
|
||||
#### How It Works
|
||||
|
||||
The continuous benchmarking is triggered via a [GitHub workflow CI](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) in the PyTorch infrastructure repository, which runs automatically every 4 hours. The workflow executes three types of performance tests:
|
||||
|
||||
- **Serving tests**: Measure request handling and API performance
|
||||
- **Throughput tests**: Evaluate token generation rates
|
||||
- **Latency tests**: Assess response time characteristics
|
||||
|
||||
#### Benchmark Configuration
|
||||
|
||||
The benchmarking currently runs on a predefined set of models configured in the [vllm-benchmarks directory](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks). To add new models for benchmarking:
|
||||
|
||||
1. Navigate to the appropriate GPU directory in the benchmarks configuration
|
||||
2. Add your model specifications to the corresponding configuration files
|
||||
3. The new models will be included in the next scheduled benchmark run
|
||||
|
||||
#### Viewing Results
|
||||
|
||||
All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
|
||||
|
||||
[](){ #nightly-benchmarks }
|
||||
|
||||
## Nightly Benchmarks
|
||||
|
||||
@ -66,35 +66,12 @@ Further update the model as follows:
|
||||
!!! important
|
||||
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
|
||||
|
||||
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
|
||||
!!! note
|
||||
By default, vLLM merges the multimodal embeddings into text embeddings depending on the information of their locations defined in
|
||||
[PlaceholderRange][vllm.multimodal.inputs.PlaceholderRange] from input processing.
|
||||
This logic can be found at [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings].
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from .utils import merge_multimodal_embeddings
|
||||
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
) -> torch.Tensor:
|
||||
|
||||
# `get_input_embeddings` should already be implemented for the language
|
||||
# model as one of the requirements of basic vLLM model implementation.
|
||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||
|
||||
if multimodal_embeddings is not None:
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids=input_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
multimodal_embeddings=multimodal_embeddings,
|
||||
placeholder_token_id=self.config.image_token_index)
|
||||
|
||||
return inputs_embeds
|
||||
```
|
||||
You may override this method if additional logic is required for your model when merging embeddings.
|
||||
|
||||
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
||||
|
||||
|
||||
@ -160,6 +160,22 @@ GUI example:
|
||||
|
||||
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
|
||||
|
||||
## Continuous Profiling
|
||||
|
||||
There is a [GitHub CI workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-profiling.yml) in the PyTorch infrastructure repository that provides continuous profiling for different models on vLLM. This automated profiling helps track performance characteristics over time and across different model configurations.
|
||||
|
||||
### How It Works
|
||||
|
||||
The workflow currently runs weekly profiling sessions for selected models, generating detailed performance traces that can be analyzed using different tools to identify performance regressions or optimization opportunities. But, it can be triggered manually as well, using the Github Action tool.
|
||||
|
||||
### Adding New Models
|
||||
|
||||
To extend the continuous profiling to additional models, you can modify the [profiling-tests.json](https://github.com/pytorch/pytorch-integration-testing/blob/main/vllm-profiling/cuda/profiling-tests.json) configuration file in the PyTorch integration testing repository. Simply add your model specifications to this file to include them in the automated profiling runs.
|
||||
|
||||
### Viewing Profiling Results
|
||||
|
||||
The profiling traces generated by the continuous profiling workflow are publicly available on the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). Look for the **Profiling traces** table to access and download the traces for different models and runs.
|
||||
|
||||
## Profiling vLLM Python Code
|
||||
|
||||
The Python standard library includes
|
||||
@ -208,3 +224,11 @@ One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
|
||||
pip install snakeviz
|
||||
snakeviz expensive_function.prof
|
||||
```
|
||||
|
||||
### Analyzing Garbage Collection Costs
|
||||
|
||||
Leverage VLLM_GC_DEBUG environment variable to debug GC costs.
|
||||
|
||||
- VLLM_GC_DEBUG=1: enable GC debugger with gc.collect elpased times
|
||||
- VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger to log top 5
|
||||
collected objects for each gc.collect
|
||||
|
||||
@ -52,7 +52,7 @@ th:not(:first-child) {
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❔ | ❔ | ❌ | ❔ | ❔ | ✅ |
|
||||
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||
|
||||
@ -84,7 +84,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
|
||||
- Connection info is passed via KVTransferParams from prefiller to decoder for handshake
|
||||
|
||||
- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
|
||||
- Default: 120
|
||||
- Default: 480
|
||||
- If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
|
||||
|
||||
## Multi-Instance Setup
|
||||
|
||||
@ -323,8 +323,10 @@ Flags: `--tool-call-parser longcat`
|
||||
|
||||
Supported models:
|
||||
|
||||
* `ZhipuAI/GLM-4.5`
|
||||
* `ZhipuAI/GLM-4.5-Air`
|
||||
* `zai-org/GLM-4.5`
|
||||
* `zai-org/GLM-4.5-Air`
|
||||
* `zai-org/GLM-4.6`
|
||||
* `zai-org/GLM-4.6-Air`
|
||||
|
||||
Flags: `--tool-call-parser glm45`
|
||||
|
||||
|
||||
@ -25,3 +25,4 @@ The backends below live **outside** the main `vllm` repository and follow the
|
||||
| MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
|
||||
| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
|
||||
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
|
||||
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
|
||||
|
||||
@ -46,22 +46,22 @@ Execute the following commands to build and install vLLM from source.
|
||||
Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.
|
||||
|
||||
```bash
|
||||
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
|
||||
sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
|
||||
uv pip install -v \
|
||||
--torch-backend auto \
|
||||
-r requirements-build.txt \
|
||||
-r requirements-cpu.txt \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu.txt \
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
|
||||
uv pip install dist/*.whl
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
```bash
|
||||
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds
|
||||
sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
|
||||
pip install -v \
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
||||
-r requirements-build.txt \
|
||||
-r requirements-cpu.txt \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu.txt \
|
||||
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
|
||||
pip install dist/*.whl
|
||||
```
|
||||
|
||||
@ -367,7 +367,7 @@ th {
|
||||
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
|
||||
@ -396,7 +396,6 @@ th {
|
||||
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MotifForCausalLM` | Motif-1-Tiny | `Motif-Technologies/Motif-2.6B`, `Motif-Technologies/Motif-2.6b-v1.1-LC`, etc. | ✅︎ | ✅︎ | |
|
||||
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
@ -404,12 +403,11 @@ th {
|
||||
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | |
|
||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
@ -351,13 +351,92 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
|
||||
To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
|
||||
<!-- TODO: api enforced limits + uploading audios -->
|
||||
|
||||
#### API Enforced Limits
|
||||
|
||||
Set the maximum audio file size (in MB) that VLLM will accept, via the
|
||||
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. Default is 25 MB.
|
||||
|
||||
#### Uploading Audio Files
|
||||
|
||||
The Transcriptions API supports uploading audio files in various formats including FLAC, MP3, MP4, MPEG, MPGA, M4A, OGG, WAV, and WEBM.
|
||||
|
||||
**Using OpenAI Python Client:**
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="token-abc123",
|
||||
)
|
||||
|
||||
# Upload audio file from disk
|
||||
with open("audio.mp3", "rb") as audio_file:
|
||||
transcription = client.audio.transcriptions.create(
|
||||
model="openai/whisper-large-v3-turbo",
|
||||
file=audio_file,
|
||||
language="en",
|
||||
response_format="verbose_json"
|
||||
)
|
||||
|
||||
print(transcription.text)
|
||||
```
|
||||
|
||||
**Using curl with multipart/form-data:**
|
||||
|
||||
??? code
|
||||
|
||||
```bash
|
||||
curl -X POST "http://localhost:8000/v1/audio/transcriptions" \
|
||||
-H "Authorization: Bearer token-abc123" \
|
||||
-F "file=@audio.mp3" \
|
||||
-F "model=openai/whisper-large-v3-turbo" \
|
||||
-F "language=en" \
|
||||
-F "response_format=verbose_json"
|
||||
```
|
||||
|
||||
**Supported Parameters:**
|
||||
|
||||
- `file`: The audio file to transcribe (required)
|
||||
- `model`: The model to use for transcription (required)
|
||||
- `language`: The language code (e.g., "en", "zh") (optional)
|
||||
- `prompt`: Optional text to guide the transcription style (optional)
|
||||
- `response_format`: Format of the response ("json", "text") (optional)
|
||||
- `temperature`: Sampling temperature between 0 and 1 (optional)
|
||||
|
||||
For the complete list of supported parameters including sampling parameters and vLLM extensions, see the [protocol definitions](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L2182).
|
||||
|
||||
**Response Format:**
|
||||
|
||||
For `verbose_json` response format:
|
||||
|
||||
??? code
|
||||
|
||||
```json
|
||||
{
|
||||
"text": "Hello, this is a transcription of the audio file.",
|
||||
"language": "en",
|
||||
"duration": 5.42,
|
||||
"segments": [
|
||||
{
|
||||
"id": 0,
|
||||
"seek": 0,
|
||||
"start": 0.0,
|
||||
"end": 2.5,
|
||||
"text": "Hello, this is a transcription",
|
||||
"tokens": [50364, 938, 428, 307, 275, 28347],
|
||||
"temperature": 0.0,
|
||||
"avg_logprob": -0.245,
|
||||
"compression_ratio": 1.235,
|
||||
"no_speech_prob": 0.012
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Extra Parameters
|
||||
|
||||
The following [sampling parameters][sampling-params] are supported.
|
||||
|
||||
@ -1,12 +1,54 @@
|
||||
# Transformers Reinforcement Learning
|
||||
|
||||
Transformers Reinforcement Learning (TRL) is a full stack library that provides a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more. The library is integrated with 🤗 transformers.
|
||||
[Transformers Reinforcement Learning](https://huggingface.co/docs/trl) (TRL) is a full stack library that provides a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more. The library is integrated with 🤗 transformers.
|
||||
|
||||
Online methods such as GRPO or Online DPO require the model to generate completions. vLLM can be used to generate these completions!
|
||||
|
||||
See the guide [vLLM for fast generation in online methods](https://huggingface.co/docs/trl/main/en/speeding_up_training#vllm-for-fast-generation-in-online-methods) in the TRL documentation for more information.
|
||||
See the [vLLM integration guide](https://huggingface.co/docs/trl/main/en/vllm_integration) in the TRL documentation for more information.
|
||||
|
||||
TRL currently supports the following online trainers with vLLM:
|
||||
|
||||
- [GRPO](https://huggingface.co/docs/trl/main/en/grpo_trainer)
|
||||
- [Online DPO](https://huggingface.co/docs/trl/main/en/online_dpo_trainer)
|
||||
- [RLOO](https://huggingface.co/docs/trl/main/en/rloo_trainer)
|
||||
- [Nash-MD](https://huggingface.co/docs/trl/main/en/nash_md_trainer)
|
||||
- [XPO](https://huggingface.co/docs/trl/main/en/xpo_trainer)
|
||||
|
||||
To enable vLLM in TRL, set the `use_vllm` flag in the trainer configuration to `True`.
|
||||
|
||||
## Modes of Using vLLM During Training
|
||||
|
||||
TRL supports **two modes** for integrating vLLM during training: **server mode** and **colocate mode**. You can control how vLLM operates during training with the `vllm_mode` parameter.
|
||||
|
||||
### Server mode
|
||||
|
||||
In **server mode**, vLLM runs as an independent process on dedicated GPUs and communicates with the trainer through HTTP requests. This configuration is ideal when you have separate GPUs for inference, as it isolates generation workloads from training, ensuring stable performance and easier scaling.
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="server", # default value, can be omitted
|
||||
)
|
||||
```
|
||||
|
||||
### Colocate mode
|
||||
|
||||
In **colocate mode**, vLLM runs inside the trainer process and shares GPU memory with the training model. This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="colocate",
|
||||
)
|
||||
```
|
||||
|
||||
Some trainers also support **vLLM sleep mode**, which offloads parameters and caches to GPU RAM during training, helping reduce memory usage. Learn more in the [memory optimization docs](https://huggingface.co/docs/trl/main/en/reducing_memory_usage#vllm-sleep-mode).
|
||||
|
||||
!!! info
|
||||
For more information on the `use_vllm` flag you can provide to the configs of these online methods, see:
|
||||
- [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm)
|
||||
- [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm)
|
||||
For detailed configuration options and flags, refer to the documentation of the specific trainer you are using.
|
||||
|
||||
@ -4,6 +4,11 @@
|
||||
experimental support for data-parallel inference with torchrun
|
||||
Note the data load balancing and distribution is done out of the vllm engine,
|
||||
no internal lb supported in external_launcher mode.
|
||||
|
||||
To run this example:
|
||||
```bash
|
||||
$ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
|
||||
```
|
||||
"""
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
@ -14,7 +19,7 @@ prompts = [
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
] * 50
|
||||
]
|
||||
|
||||
# Create sampling parameters, the same across all ranks
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
@ -45,14 +50,13 @@ prompts = [
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
|
||||
# all ranks will have the same outputs
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
|
||||
print("-" * 50)
|
||||
print(
|
||||
f"DP Rank: {dp_rank} Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n"
|
||||
)
|
||||
|
||||
"""
|
||||
Further tips:
|
||||
|
||||
|
||||
@ -38,11 +38,13 @@ client = OpenAI(
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
headers = {"User-Agent": "vLLM Example Client"}
|
||||
|
||||
|
||||
def encode_base64_content_from_url(content_url: str) -> str:
|
||||
"""Encode a content retrieved from a remote url to base64 format."""
|
||||
|
||||
with requests.get(content_url) as response:
|
||||
with requests.get(content_url, headers=headers) as response:
|
||||
response.raise_for_status()
|
||||
result = base64.b64encode(response.content).decode("utf-8")
|
||||
|
||||
@ -50,19 +52,19 @@ def encode_base64_content_from_url(content_url: str) -> str:
|
||||
|
||||
|
||||
# Text-only inference
|
||||
def run_text_only(model: str) -> None:
|
||||
def run_text_only(model: str, max_completion_tokens: int) -> None:
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[{"role": "user", "content": "What's the capital of France?"}],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion.choices[0].message.content
|
||||
print("Chat completion output:", result)
|
||||
print("Chat completion output:\n", result)
|
||||
|
||||
|
||||
# Single-image input inference
|
||||
def run_single_image(model: str) -> None:
|
||||
def run_single_image(model: str, max_completion_tokens: int) -> None:
|
||||
## Use image url in the payload
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
@ -79,11 +81,11 @@ def run_single_image(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output from image url:", result)
|
||||
print("Chat completion output from image url:\n", result)
|
||||
|
||||
## Use base64 encoded image in the payload
|
||||
image_base64 = encode_base64_content_from_url(image_url)
|
||||
@ -101,7 +103,7 @@ def run_single_image(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
@ -109,7 +111,7 @@ def run_single_image(model: str) -> None:
|
||||
|
||||
|
||||
# Multi-image input inference
|
||||
def run_multi_image(model: str) -> None:
|
||||
def run_multi_image(model: str, max_completion_tokens: int) -> None:
|
||||
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
|
||||
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
@ -130,15 +132,15 @@ def run_multi_image(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output:", result)
|
||||
print("Chat completion output:\n", result)
|
||||
|
||||
|
||||
# Video input inference
|
||||
def run_video(model: str) -> None:
|
||||
def run_video(model: str, max_completion_tokens: int) -> None:
|
||||
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
||||
video_base64 = encode_base64_content_from_url(video_url)
|
||||
|
||||
@ -157,11 +159,11 @@ def run_video(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output from image url:", result)
|
||||
print("Chat completion output from video url:\n", result)
|
||||
|
||||
## Use base64 encoded video in the payload
|
||||
chat_completion_from_base64 = client.chat.completions.create(
|
||||
@ -178,15 +180,15 @@ def run_video(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from base64 encoded image:", result)
|
||||
print("Chat completion output from base64 encoded video:\n", result)
|
||||
|
||||
|
||||
# Audio input inference
|
||||
def run_audio(model: str) -> None:
|
||||
def run_audio(model: str, max_completion_tokens: int) -> None:
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
audio_url = AudioAsset("winning_call").url
|
||||
@ -211,11 +213,11 @@ def run_audio(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from input audio:", result)
|
||||
print("Chat completion output from input audio:\n", result)
|
||||
|
||||
# HTTP URL
|
||||
chat_completion_from_url = client.chat.completions.create(
|
||||
@ -235,11 +237,11 @@ def run_audio(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_url.choices[0].message.content
|
||||
print("Chat completion output from audio url:", result)
|
||||
print("Chat completion output from audio url:\n", result)
|
||||
|
||||
# base64 URL
|
||||
chat_completion_from_base64 = client.chat.completions.create(
|
||||
@ -259,14 +261,14 @@ def run_audio(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from base64 encoded audio:", result)
|
||||
print("Chat completion output from base64 encoded audio:\n", result)
|
||||
|
||||
|
||||
def run_multi_audio(model: str) -> None:
|
||||
def run_multi_audio(model: str, max_completion_tokens: int) -> None:
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
# Two different audios to showcase batched inference.
|
||||
@ -300,11 +302,11 @@ def run_multi_audio(model: str) -> None:
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from input audio:", result)
|
||||
print("Chat completion output from input audio:\n", result)
|
||||
|
||||
|
||||
example_function_map = {
|
||||
@ -330,13 +332,20 @@ def parse_args():
|
||||
choices=list(example_function_map.keys()),
|
||||
help="Conversation type with multimodal data.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-completion-tokens",
|
||||
"-n",
|
||||
type=int,
|
||||
default=128,
|
||||
help="Maximum number of tokens to generate for each completion.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args) -> None:
|
||||
chat_type = args.chat_type
|
||||
model = get_first_model(client)
|
||||
example_function_map[chat_type](model)
|
||||
example_function_map[chat_type](model, args.max_completion_tokens)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -102,7 +102,6 @@ plugins:
|
||||
- https://numpy.org/doc/stable/objects.inv
|
||||
- https://pytorch.org/docs/stable/objects.inv
|
||||
- https://psutil.readthedocs.io/en/stable/objects.inv
|
||||
- https://huggingface.co/docs/transformers/main/en/objects.inv
|
||||
|
||||
markdown_extensions:
|
||||
- attr_list
|
||||
|
||||
@ -43,7 +43,6 @@ tritonclient==2.51.0
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
numpy
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
runai-model-streamer[s3]==0.14.0
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.10 # 2.9 leads to error on python 3.10
|
||||
|
||||
@ -5,8 +5,6 @@ numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Req
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
|
||||
# Dependencies for AMD GPUs
|
||||
boto3
|
||||
botocore
|
||||
datasets
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
peft
|
||||
@ -15,7 +13,6 @@ tensorizer==2.10.1
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
runai-model-streamer[s3]==0.14.0
|
||||
conch-triton-kernels==1.2.1
|
||||
timm>=1.0.17
|
||||
@ -51,8 +51,7 @@ tritonclient==2.51.0
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
numpy
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
runai-model-streamer[s3]==0.14.0
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.10 # 2.9 leads to error on python 3.10
|
||||
decord==0.6.0
|
||||
|
||||
@ -72,7 +72,9 @@ blobfile==3.0.0
|
||||
bm25s==0.2.13
|
||||
# via mteb
|
||||
boto3==1.35.57
|
||||
# via tensorizer
|
||||
# via
|
||||
# runai-model-streamer-s3
|
||||
# tensorizer
|
||||
botocore==1.35.57
|
||||
# via
|
||||
# boto3
|
||||
@ -925,10 +927,10 @@ rsa==4.9.1
|
||||
# via google-auth
|
||||
rtree==1.4.0
|
||||
# via torchgeo
|
||||
runai-model-streamer==0.11.0
|
||||
# via -r requirements/test.in
|
||||
runai-model-streamer-s3==0.11.0
|
||||
runai-model-streamer==0.14.0
|
||||
# via -r requirements/test.in
|
||||
runai-model-streamer-s3==0.14.0
|
||||
# via runai-model-streamer
|
||||
s3transfer==0.10.3
|
||||
# via boto3
|
||||
sacrebleu==2.4.3
|
||||
|
||||
5
setup.py
5
setup.py
@ -658,10 +658,7 @@ setup(
|
||||
"bench": ["pandas", "datasets"],
|
||||
"tensorizer": ["tensorizer==2.10.1"],
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"runai": [
|
||||
"runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
|
||||
"google-cloud-storage", "runai-model-streamer-s3", "boto3"
|
||||
],
|
||||
"runai": ["runai-model-streamer[s3,gcs] >= 0.14.0"],
|
||||
"audio": ["librosa", "soundfile",
|
||||
"mistral_common[audio]"], # Required for audio processing
|
||||
"video": [], # Kept for backwards compatibility
|
||||
|
||||
@ -3,12 +3,11 @@
|
||||
import contextlib
|
||||
import os
|
||||
import weakref
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.utils import wait_for_gpu_memory_to_clear
|
||||
from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationConfig
|
||||
from vllm.platforms import current_platform
|
||||
@ -33,89 +32,6 @@ def temporary_environ(env_vars):
|
||||
os.environ[k] = v
|
||||
|
||||
|
||||
@dataclass
|
||||
class BackendConfig:
|
||||
name: str
|
||||
env_vars: dict
|
||||
comp_config: dict
|
||||
specific_gpu_arch: Optional[tuple] = None
|
||||
|
||||
|
||||
# Define all backend configurations of full cudagraph to be tested
|
||||
backend_configs = {
|
||||
# FA3 on Hopper
|
||||
"FA3":
|
||||
BackendConfig(name="FA3",
|
||||
env_vars={
|
||||
"VLLM_FLASH_ATTN_VERSION": "3",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FlashMLA on Hopper
|
||||
"FlashMLA":
|
||||
BackendConfig(name="FlashMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASHMLA",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FlashAttention MLA on Hopper
|
||||
"FlashAttentionMLA":
|
||||
BackendConfig(name="FlashAttentionMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# Cutlass MLA on Blackwell
|
||||
"CutlassMLA":
|
||||
BackendConfig(
|
||||
name="CutlassMLA",
|
||||
env_vars={
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
|
||||
"FORCE_NUM_KV_SPLITS":
|
||||
"1", # TODO: remove this when hang issue is fixed
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
"cudagraph_capture_sizes": [16, 32, 64, 128, 256, 512],
|
||||
},
|
||||
specific_gpu_arch=(10, 0)),
|
||||
# FA2
|
||||
"FA2":
|
||||
BackendConfig(name="FA2",
|
||||
env_vars={
|
||||
"VLLM_FLASH_ATTN_VERSION": "2",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
}),
|
||||
# Triton Attention
|
||||
"TritonAttn":
|
||||
BackendConfig(name="TritonAttn",
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
}),
|
||||
# FlashInfer
|
||||
"FlashInfer":
|
||||
BackendConfig(name="FlashInfer",
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
}),
|
||||
}
|
||||
|
||||
test_params_full_cudagraph = []
|
||||
|
||||
# deepseek-ai/DeepSeek-V2-Lite with MLA
|
||||
|
||||
@ -4,7 +4,7 @@ import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.config import CompilationConfig, VllmConfig
|
||||
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
||||
from vllm.utils import _is_torch_equal_or_newer
|
||||
|
||||
|
||||
@ -106,7 +106,6 @@ def test_dynamo_as_is(vllm_runner, monkeypatch):
|
||||
def test_no_compilation(vllm_runner, monkeypatch):
|
||||
# Disable multiprocessing so that the counter is in the same process
|
||||
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
|
||||
|
||||
with (
|
||||
compilation_counter.expect(num_graphs_seen=0,
|
||||
dynamo_as_is_count=0),
|
||||
@ -131,3 +130,67 @@ def test_enforce_eager(vllm_runner, monkeypatch):
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.4) as _):
|
||||
pass
|
||||
|
||||
|
||||
def test_splitting_ops_dynamic():
|
||||
# Default config
|
||||
config = VllmConfig()
|
||||
assert config.compilation_config.cudagraph_mode == \
|
||||
CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
assert config.compilation_config.splitting_ops_contain_attention()
|
||||
|
||||
# When use_inductor_graph_partition=True
|
||||
if _is_torch_equal_or_newer('2.9.0.dev'):
|
||||
# inductor graph partition is only available in PyTorch 2.9+.
|
||||
# this is a fast config check so we are not using pytest.skip.
|
||||
config = VllmConfig(compilation_config=CompilationConfig(
|
||||
use_inductor_graph_partition=True,
|
||||
splitting_ops=["silly_attention"]))
|
||||
# should ignore splitting_ops
|
||||
assert config.compilation_config.splitting_ops == []
|
||||
|
||||
# When attn_fusion pass enabled.
|
||||
config = VllmConfig(compilation_config=CompilationConfig(
|
||||
pass_config={
|
||||
"enable_attn_fusion": True,
|
||||
"enable_noop": True
|
||||
},
|
||||
custom_ops=["+quant_fp8"],
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
))
|
||||
assert config.compilation_config.splitting_ops == []
|
||||
# cudagraph mode also fall back to FULL
|
||||
assert config.compilation_config.cudagraph_mode == \
|
||||
CUDAGraphMode.FULL
|
||||
|
||||
# splitting_ops can not contain attention ops when attn_fusion
|
||||
# pass enabled.
|
||||
with pytest.raises(AssertionError):
|
||||
config = VllmConfig(compilation_config=CompilationConfig(
|
||||
pass_config={
|
||||
"enable_attn_fusion": True,
|
||||
"enable_noop": True
|
||||
},
|
||||
custom_ops=["+quant_fp8"],
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
# work around for accessing all attntion ops
|
||||
splitting_ops=CompilationConfig()._attention_ops,
|
||||
))
|
||||
|
||||
# When both use_inductor_graph_partition and attn_fusion pass enabled.
|
||||
if _is_torch_equal_or_newer('2.9.0.dev'):
|
||||
config = VllmConfig(compilation_config=CompilationConfig(
|
||||
use_inductor_graph_partition=True,
|
||||
pass_config={
|
||||
"enable_attn_fusion": True,
|
||||
"enable_noop": True
|
||||
},
|
||||
custom_ops=["+quant_fp8"],
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
))
|
||||
assert config.compilation_config.splitting_ops == []
|
||||
# enable_attn_fusion is directly support under
|
||||
# use_inductor_graph_partition=True, and cudagraph_mode
|
||||
# is unchanged.
|
||||
assert config.compilation_config.cudagraph_mode == \
|
||||
CUDAGraphMode.PIECEWISE
|
||||
|
||||
@ -139,6 +139,21 @@ def test_custom_compile_config(
|
||||
run_model(compilation_config, model, model_kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"optimization_level",
|
||||
[CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
|
||||
)
|
||||
def test_fp8_kv_scale_compile(optimization_level: int):
|
||||
model = "Qwen/Qwen2-0.5B"
|
||||
model_kwargs = {
|
||||
"quantization": "fp8",
|
||||
"kv_cache_dtype": "fp8_e4m3",
|
||||
"calculate_kv_scales": True,
|
||||
"max_model_len": 512,
|
||||
}
|
||||
run_model(optimization_level, model, model_kwargs)
|
||||
|
||||
|
||||
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||
if not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available "
|
||||
|
||||
@ -208,3 +208,11 @@ def zephyr_lora_files():
|
||||
"""Download zephyr LoRA files once per test session."""
|
||||
from huggingface_hub import snapshot_download
|
||||
return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def opt125_lora_files() -> str:
|
||||
"""Download opt-125m LoRA files once per test session."""
|
||||
from huggingface_hub import snapshot_download
|
||||
return snapshot_download(
|
||||
repo_id="peft-internal-testing/opt-125m-dummy-lora")
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
@ -16,13 +17,15 @@ from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "facebook/opt-125m"
|
||||
LORA_SERVING_MODEL_NAME = "opt125m-lora"
|
||||
|
||||
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args() -> list[str]:
|
||||
return [
|
||||
@pytest.fixture(scope="module", params=["use-lora"])
|
||||
def default_server_args(request: pytest.FixtureRequest,
|
||||
opt125_lora_files: str) -> list[str]:
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
@ -35,6 +38,25 @@ def default_server_args() -> list[str]:
|
||||
"--enable-prompt-embeds",
|
||||
]
|
||||
|
||||
if request.param == "use-lora":
|
||||
lora_module_1 = {
|
||||
"name": LORA_SERVING_MODEL_NAME,
|
||||
"path": opt125_lora_files,
|
||||
"base_model_name": MODEL_NAME
|
||||
}
|
||||
|
||||
args.extend([
|
||||
"--enable-lora",
|
||||
"--lora-module",
|
||||
json.dumps(lora_module_1),
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--max-cpu-loras",
|
||||
"2",
|
||||
])
|
||||
|
||||
return args
|
||||
|
||||
|
||||
EXAMPLE_PROMPTS = [
|
||||
"Hello, my name is",
|
||||
@ -74,7 +96,7 @@ async def client_with_prompt_embeds(server_with_prompt_embeds):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
|
||||
async def test_completions_with_prompt_embeds(
|
||||
example_prompt_embeds,
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI,
|
||||
@ -179,7 +201,7 @@ async def test_completions_with_prompt_embeds(
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
|
||||
async def test_completions_errors_with_prompt_embeds(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
|
||||
# Test error case: invalid prompt_embeds
|
||||
@ -194,7 +216,7 @@ async def test_completions_errors_with_prompt_embeds(
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("logprobs_arg", [1, 0])
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME, LORA_SERVING_MODEL_NAME])
|
||||
async def test_completions_with_logprobs_and_prompt_embeds(
|
||||
example_prompt_embeds,
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI,
|
||||
|
||||
@ -1,52 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_regex():
|
||||
return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
|
||||
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_json_schema():
|
||||
return {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"age": {
|
||||
"type": "integer"
|
||||
},
|
||||
"skills": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"maxLength": 10
|
||||
},
|
||||
"minItems": 3
|
||||
},
|
||||
"work_history": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"company": {
|
||||
"type": "string"
|
||||
},
|
||||
"duration": {
|
||||
"type": "number"
|
||||
},
|
||||
"position": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": ["company", "position"]
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["name", "age", "skills", "work_history"]
|
||||
}
|
||||
@ -10,7 +10,7 @@ from vllm.model_executor.model_loader import tensorizer as tensorizer_mod
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
|
||||
from vllm.v1.executor.abstract import UniProcExecutor
|
||||
from vllm.worker.worker_base import WorkerWrapperBase
|
||||
from vllm.v1.worker.worker_base import WorkerWrapperBase
|
||||
|
||||
MODEL_REF = "facebook/opt-125m"
|
||||
|
||||
@ -14,6 +14,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.model_loader.tensorizer
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
# yapf: disable
|
||||
@ -27,7 +28,6 @@ from vllm.model_executor.model_loader.tensorizer_loader import (
|
||||
# yapf: enable
|
||||
from vllm.utils import PlaceholderModule
|
||||
|
||||
from ..utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from .conftest import DummyExecutor, assert_from_collective_rpc
|
||||
|
||||
try:
|
||||
@ -299,9 +299,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
|
||||
"MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", # noqa: E501
|
||||
{"tiny": "TitanML/tiny-mixtral"}), # noqa: E501
|
||||
"MotifForCausalLM": _HfExamplesInfo("Motif-Technologies/Motif-2.6B",
|
||||
trust_remote_code=True,
|
||||
v0_only=True),
|
||||
"MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
|
||||
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
|
||||
"NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
|
||||
@ -643,7 +640,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
speculative_model="baidu/ERNIE-4.5-21B-A3B-PT"),
|
||||
"Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5",
|
||||
speculative_model="zai-org/GLM-4.5",
|
||||
min_transformers_version="4.54",
|
||||
min_transformers_version="4.56",
|
||||
is_available_online=False),
|
||||
"LongCatFlashMTPModel": _HfExamplesInfo(
|
||||
"meituan-longcat/LongCat-Flash-Chat",
|
||||
@ -652,6 +649,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
"MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
|
||||
trust_remote_code=True,
|
||||
speculative_model="XiaomiMiMo/MiMo-7B-RL"),
|
||||
"Eagle3Qwen2_5vlForCausalLM": _HfExamplesInfo(
|
||||
"Qwen/Qwen2.5-VL-7B-Instruct",
|
||||
speculative_model="Rayzl/qwen2.5-vl-7b-eagle3-sgl"),
|
||||
"Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
|
||||
min_transformers_version="4.56.3"),
|
||||
}
|
||||
|
||||
@ -79,10 +79,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
if model_info.v0_only:
|
||||
# NOTE(woosuk): skip the test for V0-only models
|
||||
return
|
||||
|
||||
if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
|
||||
pytest.skip(
|
||||
"Differential Flash Attention backend has been removed.")
|
||||
if model_arch == "GptOssForCausalLM":
|
||||
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
|
||||
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
|
||||
|
||||
@ -100,10 +100,9 @@ def test_distributed(
|
||||
kwargs_test=kwargs)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason="bitsandbytes quantization is currently not supported in rocm.")
|
||||
@pytest.mark.parametrize("model, quantization_kwargs", [
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
|
||||
(
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
{
|
||||
@ -121,6 +120,11 @@ def test_quantization(
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
if (current_platform.is_rocm()
|
||||
and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
|
||||
pytest.skip(
|
||||
"bitsandbytes quantization is currently not supported in rocm.")
|
||||
|
||||
with vllm_runner(
|
||||
model, model_impl="auto", enforce_eager=True,
|
||||
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
|
||||
|
||||
@ -18,7 +18,7 @@ from vllm.utils import get_open_port, update_environment_variables
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("feature_sample_layers", "num_layers_loaded", "max_possible_layers",
|
||||
("select_layers", "num_layers_loaded", "max_possible_layers",
|
||||
"expected_features"),
|
||||
[
|
||||
# All layers loaded
|
||||
@ -28,8 +28,8 @@ from vllm.utils import get_open_port, update_environment_variables
|
||||
([1, 10], 10, 20, [1, 10]),
|
||||
([-20, -11], 10, 20, [1, 10]),
|
||||
])
|
||||
def test_resolve_visual_encoder_outputs(feature_sample_layers,
|
||||
num_layers_loaded, max_possible_layers,
|
||||
def test_resolve_visual_encoder_outputs(select_layers, num_layers_loaded,
|
||||
max_possible_layers,
|
||||
expected_features):
|
||||
"""
|
||||
Test that offsets are correctly handled for vision feature layers.
|
||||
@ -39,9 +39,10 @@ def test_resolve_visual_encoder_outputs(feature_sample_layers,
|
||||
]
|
||||
output_tensor = resolve_visual_encoder_outputs(
|
||||
encoder_outputs=encoder_outputs,
|
||||
feature_sample_layers=feature_sample_layers,
|
||||
post_layer_norm=None,
|
||||
max_possible_layers=max_possible_layers)
|
||||
select_layers=select_layers,
|
||||
max_possible_layers=max_possible_layers,
|
||||
)
|
||||
assert torch.equal(torch.tensor(expected_features), output_tensor)
|
||||
|
||||
|
||||
|
||||
132
tests/quantization/test_blackwell_moe.py
Normal file
132
tests/quantization/test_blackwell_moe.py
Normal file
@ -0,0 +1,132 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if not current_platform.is_device_capability(100):
|
||||
pytest.skip("This test only runs on Blackwell GPUs (SM100).",
|
||||
allow_module_level=True)
|
||||
|
||||
os.environ["FLASHINFER_NVCC_THREADS"] = "16"
|
||||
|
||||
# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
|
||||
# "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
|
||||
dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
|
||||
|
||||
|
||||
def can_initialize(model: str, extra_args: list[str]):
|
||||
|
||||
# Server arguments
|
||||
server_args = [
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-batched-tokens",
|
||||
"256",
|
||||
"--load-format",
|
||||
"dummy",
|
||||
"--trust-remote-code",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"image": 0}),
|
||||
*extra_args,
|
||||
]
|
||||
|
||||
# Launch server and make a simple request
|
||||
with RemoteOpenAIServer(
|
||||
model,
|
||||
server_args,
|
||||
max_wait_seconds=1000, # Due to FlashInfer compile
|
||||
override_hf_configs=dummy_hf_overrides) as server:
|
||||
client = server.get_client()
|
||||
# Make a simple request to verify the server works
|
||||
completion = client.completions.create(
|
||||
model=model,
|
||||
prompt=["Hello, World!"],
|
||||
temperature=0,
|
||||
max_tokens=2,
|
||||
)
|
||||
print(completion)
|
||||
assert completion.choices[0].text is not None
|
||||
|
||||
|
||||
## Llama4 ##
|
||||
|
||||
|
||||
@pytest.mark.skip(reason=(
|
||||
"RuntimeError: run_moe() Expected a value of type "
|
||||
"'Optional[List[Tensor]]' for argument '_9' but instead found type "
|
||||
"'list'."))
|
||||
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Works, but takes too long to run")
|
||||
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Works, but takes too long to run")
|
||||
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
|
||||
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
|
||||
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
|
||||
|
||||
|
||||
## DeepSeekV3 ##
|
||||
|
||||
|
||||
def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
|
||||
can_initialize("deepseek-ai/DeepSeek-V3.1", [])
|
||||
|
||||
|
||||
def test_deepseek_nvfp4_moe_flashinfer_cutlass(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
|
||||
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
|
||||
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
|
||||
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
|
||||
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
|
||||
|
||||
|
||||
## GPT-OSS ##
|
||||
|
||||
|
||||
def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
|
||||
can_initialize("openai/gpt-oss-20b", [])
|
||||
|
||||
|
||||
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
|
||||
can_initialize("openai/gpt-oss-20b", [])
|
||||
|
||||
|
||||
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
|
||||
monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
|
||||
can_initialize("openai/gpt-oss-20b", [])
|
||||
203
tests/reasoning/test_glm4_moe_reasoning_parser.py
Normal file
203
tests/reasoning/test_glm4_moe_reasoning_parser.py
Normal file
@ -0,0 +1,203 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.reasoning.utils import run_reasoning_extraction
|
||||
from vllm.reasoning import ReasoningParser, ReasoningParserManager
|
||||
|
||||
parser_name = "glm45"
|
||||
start_token = "<think>"
|
||||
end_token = "</think>"
|
||||
|
||||
REASONING_MODEL_NAME = "zai-org/GLM-4.5"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def glm45_tokenizer():
|
||||
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
|
||||
|
||||
|
||||
WITH_THINK = {
|
||||
"output": "<think>This is a reasoning section</think>This is the rest",
|
||||
"reasoning_content": "This is a reasoning section",
|
||||
"content": "This is the rest",
|
||||
"is_reasoning_end": True,
|
||||
}
|
||||
|
||||
WITH_THINK_STREAM = {
|
||||
"output": "<think>This is a reasoning section</think>This is the rest",
|
||||
"reasoning_content": "This is a reasoning section",
|
||||
"content": "This is the rest",
|
||||
"is_reasoning_end": True,
|
||||
}
|
||||
|
||||
WITHOUT_THINK = {
|
||||
"output": "This is the rest",
|
||||
"reasoning_content": None,
|
||||
"content": "This is the rest",
|
||||
"is_reasoning_end": False,
|
||||
}
|
||||
|
||||
WITHOUT_THINK_STREAM = {
|
||||
"output": "This is the rest",
|
||||
"reasoning_content": None,
|
||||
"content": "This is the rest",
|
||||
"is_reasoning_end": False,
|
||||
}
|
||||
|
||||
COMPLETE_REASONING = {
|
||||
"output": "<think>This is a reasoning section</think>",
|
||||
"reasoning_content": "This is a reasoning section",
|
||||
"content": None,
|
||||
"is_reasoning_end": True,
|
||||
}
|
||||
MULTILINE_REASONING = {
|
||||
"output":
|
||||
"<think>This is a reasoning\nsection</think>This is the rest\nThat",
|
||||
"reasoning_content": "This is a reasoning\nsection",
|
||||
"content": "This is the rest\nThat",
|
||||
"is_reasoning_end": True,
|
||||
}
|
||||
ONLY_OPEN_TAG = {
|
||||
"output": "<think>This is a reasoning section",
|
||||
"reasoning_content": None,
|
||||
"content": "<think>This is a reasoning section",
|
||||
"is_reasoning_end": False,
|
||||
}
|
||||
|
||||
ONLY_OPEN_TAG_STREAM = {
|
||||
"output": "<think>This is a reasoning section",
|
||||
"reasoning_content": "This is a reasoning section",
|
||||
"content": None,
|
||||
"is_reasoning_end": False,
|
||||
}
|
||||
|
||||
TEST_CASES = [
|
||||
pytest.param(
|
||||
False,
|
||||
WITH_THINK,
|
||||
id="with_think",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
WITH_THINK_STREAM,
|
||||
id="with_think_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
WITHOUT_THINK,
|
||||
id="without_think",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
WITHOUT_THINK_STREAM,
|
||||
id="without_think_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
COMPLETE_REASONING,
|
||||
id="complete_reasoning",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
COMPLETE_REASONING,
|
||||
id="complete_reasoning_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
MULTILINE_REASONING,
|
||||
id="multiline_reasoning",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
MULTILINE_REASONING,
|
||||
id="multiline_reasoning_stream",
|
||||
),
|
||||
pytest.param(
|
||||
False,
|
||||
ONLY_OPEN_TAG,
|
||||
id="only_open_tag",
|
||||
),
|
||||
pytest.param(
|
||||
True,
|
||||
ONLY_OPEN_TAG_STREAM,
|
||||
id="only_open_tag_stream",
|
||||
),
|
||||
]
|
||||
|
||||
STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
|
||||
You are a helpful assistant.<|user|>
|
||||
What is the capital of France?<|assistant|>
|
||||
<think>The user is asking for the capital of"""
|
||||
|
||||
DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
|
||||
You are a helpful assistant.<|user|>
|
||||
What is the capital of France?<|assistant|>
|
||||
<think>The user is asking for the capital of France.</think>
|
||||
The capital of France is Paris."""
|
||||
|
||||
MULTI_TURN_STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
|
||||
You are a helpful assistant.<|user|>
|
||||
What is the capital of France?<|assistant|>
|
||||
<think></think>
|
||||
The capital of France is Paris.<|user|>
|
||||
What about Chile?<|assistant|>
|
||||
<think>The user is asking for the capital of"""
|
||||
|
||||
MULTI_TURN_DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
|
||||
You are a helpful assistant.<|user|>
|
||||
What is the capital of France?<|assistant|>
|
||||
<think></think>
|
||||
The capital of France is Paris.<|user|>
|
||||
What about Chile?<|assistant|>
|
||||
<think>The user is asking for the capital of Chile.</think>
|
||||
The capital of Chile is Santiago."""
|
||||
|
||||
REASONING_END_TEST_CASES = [
|
||||
pytest.param(STILL_REASONING_PROMPT, False, id="still_reasoning"),
|
||||
pytest.param(DONE_REASONING_PROMPT, True, id="done_reasoning"),
|
||||
pytest.param(MULTI_TURN_STILL_REASONING_PROMPT,
|
||||
False,
|
||||
id="multi_turn_still_reasoning"),
|
||||
pytest.param(MULTI_TURN_DONE_REASONING_PROMPT,
|
||||
True,
|
||||
id="multi_turn_done_reasoning")
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
|
||||
def test_reasoning(
|
||||
streaming: bool,
|
||||
param_dict: dict,
|
||||
glm45_tokenizer,
|
||||
):
|
||||
output = glm45_tokenizer.tokenize(param_dict["output"])
|
||||
output_tokens: list[str] = [
|
||||
glm45_tokenizer.convert_tokens_to_string([token]) for token in output
|
||||
]
|
||||
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
|
||||
parser_name)(glm45_tokenizer)
|
||||
|
||||
reasoning, content = run_reasoning_extraction(parser,
|
||||
output_tokens,
|
||||
streaming=streaming)
|
||||
|
||||
assert reasoning == param_dict["reasoning_content"]
|
||||
assert content == param_dict["content"]
|
||||
|
||||
output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
|
||||
is_reasoning_end = parser.is_reasoning_end(output_ids)
|
||||
assert is_reasoning_end == param_dict["is_reasoning_end"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("prompt, is_reasoning_end", REASONING_END_TEST_CASES)
|
||||
def test_is_reasoning_end_full_prompt(prompt: str, is_reasoning_end: bool,
|
||||
glm45_tokenizer):
|
||||
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
|
||||
parser_name)(glm45_tokenizer)
|
||||
tokens = glm45_tokenizer.tokenize(prompt)
|
||||
token_ids = glm45_tokenizer.convert_tokens_to_ids(tokens)
|
||||
check_is_reasoning_end = parser.is_reasoning_end(token_ids)
|
||||
assert check_is_reasoning_end == is_reasoning_end
|
||||
@ -14,6 +14,9 @@ from vllm.model_executor.models.interfaces import supports_eagle3
|
||||
pytest.param(
|
||||
"nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized",
|
||||
id="qwen3-eagle3-speculator"),
|
||||
pytest.param(
|
||||
"nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16",
|
||||
id="qwen3-eagle3-speculator-w4a16-verifier"),
|
||||
])
|
||||
def test_eagle3_speculators_model(vllm_runner, example_prompts, model_path,
|
||||
monkeypatch):
|
||||
|
||||
@ -91,8 +91,10 @@ class RemoteOpenAIServer:
|
||||
env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
|
||||
if env_dict is not None:
|
||||
env.update(env_dict)
|
||||
serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
|
||||
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
|
||||
self.proc: subprocess.Popen = subprocess.Popen(
|
||||
["vllm", "serve", model, *vllm_serve_args],
|
||||
serve_cmd,
|
||||
env=env,
|
||||
stdout=sys.stdout,
|
||||
stderr=sys.stderr,
|
||||
@ -1141,6 +1143,8 @@ def get_attn_backend_list_based_on_platform() -> list[str]:
|
||||
print("Skip FLASH_ATTN on ROCm as aiter is not installed")
|
||||
|
||||
return attn_backend_list
|
||||
elif current_platform.is_xpu():
|
||||
return ["FLASH_ATTN", "TRITON_ATTN"]
|
||||
else:
|
||||
raise ValueError("Unsupported platform")
|
||||
|
||||
|
||||
69
tests/utils_/test_gc_utils.py
Normal file
69
tests/utils_/test_gc_utils.py
Normal file
@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from vllm.utils.gc_utils import (GCDebugConfig, _compute_detailed_type,
|
||||
_compute_top_gc_collected_objects)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Normal:
|
||||
v: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class ListWrapper:
|
||||
vs: list[int]
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.vs)
|
||||
|
||||
|
||||
def test_compute_detailed_type():
|
||||
assert _compute_detailed_type(
|
||||
Normal(v=8)) == "<class 'tests.utils_.test_gc_utils.Normal'>"
|
||||
|
||||
assert _compute_detailed_type([1, 2, 3]) == "<class 'list'>(size:3)"
|
||||
assert _compute_detailed_type({4, 5}) == "<class 'set'>(size:2)"
|
||||
assert _compute_detailed_type({6: 7}) == "<class 'dict'>(size:1)"
|
||||
assert _compute_detailed_type(ListWrapper(
|
||||
vs=[])) == "<class 'tests.utils_.test_gc_utils.ListWrapper'>(size:0)"
|
||||
|
||||
|
||||
def test_compute_top_gc_collected_objects():
|
||||
objects: list[Any] = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12],
|
||||
{13, 14}, {
|
||||
15: 16,
|
||||
17: 18
|
||||
},
|
||||
Normal(v=19),
|
||||
Normal(v=20),
|
||||
Normal(v=21)]
|
||||
assert _compute_top_gc_collected_objects(objects, top=-1) == ""
|
||||
assert _compute_top_gc_collected_objects(objects, top=0) == ""
|
||||
assert _compute_top_gc_collected_objects(
|
||||
objects, top=1) == " 4:<class 'list'>(size:3)"
|
||||
assert _compute_top_gc_collected_objects(objects, top=2) == "\n".join([
|
||||
" 4:<class 'list'>(size:3)",
|
||||
" 3:<class 'tests.utils_.test_gc_utils.Normal'>"
|
||||
])
|
||||
assert _compute_top_gc_collected_objects(objects, top=3) == "\n".join([
|
||||
" 4:<class 'list'>(size:3)",
|
||||
" 3:<class 'tests.utils_.test_gc_utils.Normal'>",
|
||||
" 1:<class 'set'>(size:2)"
|
||||
])
|
||||
|
||||
|
||||
def test_gc_debug_config():
|
||||
assert not GCDebugConfig(None).enabled
|
||||
assert not GCDebugConfig("").enabled
|
||||
assert not GCDebugConfig("0").enabled
|
||||
|
||||
config = GCDebugConfig("1")
|
||||
assert config.enabled
|
||||
assert config.top_objects == -1
|
||||
|
||||
config = GCDebugConfig("{\"top_objects\":5}")
|
||||
assert config.enabled
|
||||
assert config.top_objects == 5
|
||||
@ -3,7 +3,7 @@
|
||||
"""Utility functions for attention-related v1 tests."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -259,3 +259,88 @@ def create_dummy_kv_cache(block_size: int,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
return kv_cache
|
||||
|
||||
|
||||
@dataclass
|
||||
class BackendConfig:
|
||||
name: str
|
||||
env_vars: dict
|
||||
comp_config: dict # compilation config
|
||||
specific_gpu_arch: Optional[tuple] = None
|
||||
|
||||
|
||||
# Define all backend configurations of full cudagraph to be tested
|
||||
full_cg_backend_configs = {
|
||||
# FA3 on Hopper
|
||||
"FA3":
|
||||
BackendConfig(name="FA3",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_FLASH_ATTN_VERSION": "3",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FlashMLA on Hopper
|
||||
"FlashMLA":
|
||||
BackendConfig(name="FlashMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASHMLA",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# Cutlass MLA on Blackwell
|
||||
"CutlassMLA":
|
||||
BackendConfig(
|
||||
name="CutlassMLA",
|
||||
env_vars={
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
|
||||
"FORCE_NUM_KV_SPLITS":
|
||||
"1", # TODO: remove this when hang issue is fixed
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
},
|
||||
specific_gpu_arch=(10, 0)),
|
||||
# FlashAttention MLA on Hopper
|
||||
"FlashAttentionMLA":
|
||||
BackendConfig(name="FlashAttentionMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FA2
|
||||
"FA2":
|
||||
BackendConfig(name="FA2",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
|
||||
"VLLM_FLASH_ATTN_VERSION": "2",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
}),
|
||||
# Triton Attention
|
||||
"TritonAttn":
|
||||
BackendConfig(name="TritonAttn",
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
}),
|
||||
# FlashInfer
|
||||
"FlashInfer":
|
||||
BackendConfig(name="FlashInfer",
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
}),
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import dataclasses
|
||||
from typing import Optional
|
||||
from unittest.mock import Mock
|
||||
|
||||
@ -1899,4 +1900,53 @@ def test_priority_scheduling_preemption_when_out_of_kv():
|
||||
assert output.scheduled_cached_reqs.num_reqs == 1
|
||||
assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id
|
||||
assert len(scheduler.waiting) == 1
|
||||
assert len(scheduler.running) == 1
|
||||
assert len(scheduler.running) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("enable_chunked_prefill", "is_encoder_decoder", "expect_enabled"),
|
||||
[
|
||||
(True, False, True),
|
||||
(False, False, False),
|
||||
# Encoder-decoder models should always have it disabled
|
||||
(False, True, False),
|
||||
(True, True, False),
|
||||
])
|
||||
def test_chunked_prefill_disabled_for_encoder_decoder(
|
||||
enable_chunked_prefill: bool, is_encoder_decoder: bool,
|
||||
expect_enabled: bool) -> None:
|
||||
"""Validate that chunked prefill is appropriately disabled for
|
||||
encoder-decoder models."""
|
||||
scheduler_config = SchedulerConfig(
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
)
|
||||
|
||||
# `is_encoder_decoder` should only be used during construction
|
||||
# of the config, and otherwise stored in the model config.
|
||||
assert "is_encoder_decoder" not in vars(scheduler_config)
|
||||
assert "is_encoder_decoder" not in [
|
||||
f.name for f in dataclasses.fields(scheduler_config)
|
||||
]
|
||||
_validate_chunked_prefill_settings_for_encoder_decoder(
|
||||
scheduler_config, is_encoder_decoder, expect_enabled)
|
||||
|
||||
# Ensure it is retained in VllmConfig, even after its post-init.
|
||||
vllm_config = VllmConfig(scheduler_config=scheduler_config)
|
||||
_validate_chunked_prefill_settings_for_encoder_decoder(
|
||||
vllm_config.scheduler_config, is_encoder_decoder, expect_enabled)
|
||||
|
||||
|
||||
def _validate_chunked_prefill_settings_for_encoder_decoder(
|
||||
scheduler_config: SchedulerConfig, is_encoder_decoder: bool,
|
||||
expect_enabled: bool) -> None:
|
||||
"""Validate chunked prefill settings in the scheduler config for
|
||||
encoder-decoder models."""
|
||||
assert scheduler_config.chunked_prefill_enabled is expect_enabled
|
||||
assert scheduler_config.enable_chunked_prefill is expect_enabled
|
||||
if is_encoder_decoder:
|
||||
# Encoder-decoder models should automatically disable chunked multimodal
|
||||
# inputs as well
|
||||
assert scheduler_config.disable_chunked_mm_input is not expect_enabled
|
||||
if is_encoder_decoder and not expect_enabled:
|
||||
assert scheduler_config.long_prefill_token_threshold == 0
|
||||
|
||||
@ -45,39 +45,22 @@ def _create_vllm_config(compilation_config: CompilationConfig,
|
||||
class TestCudagraphDispatcher:
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params",
|
||||
"case_id,cudagraph_mode_str,compilation_level",
|
||||
[
|
||||
# Test case 0: Full CG for mixed batches, no separate routine
|
||||
{
|
||||
"case_id": 0,
|
||||
"cudagraph_mode": "FULL",
|
||||
"compilation_level": CompilationLevel.NO_COMPILATION,
|
||||
},
|
||||
(0, "FULL", CompilationLevel.NO_COMPILATION),
|
||||
# Test case 1: Full CG for uniform batches, piecewise for mixed
|
||||
{
|
||||
"case_id": 1,
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
"compilation_level": CompilationLevel.PIECEWISE,
|
||||
},
|
||||
(1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION),
|
||||
# Test case 2: Full CG for uniform batches, no CG for mixed
|
||||
{
|
||||
"case_id": 2,
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
"compilation_level": CompilationLevel.NO_COMPILATION,
|
||||
},
|
||||
(2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION),
|
||||
# Test case 3: Piecewise for all
|
||||
{
|
||||
"case_id": 3,
|
||||
"cudagraph_mode": "PIECEWISE",
|
||||
"compilation_level": CompilationLevel.PIECEWISE,
|
||||
},
|
||||
(3, "PIECEWISE", CompilationLevel.PIECEWISE),
|
||||
])
|
||||
def test_dispatcher(self, params):
|
||||
def test_dispatcher(self, cudagraph_mode_str, compilation_level):
|
||||
# Setup dispatcher
|
||||
comp_config = CompilationConfig(
|
||||
cudagraph_mode=params["cudagraph_mode"],
|
||||
level=params["compilation_level"],
|
||||
cudagraph_capture_sizes=[1, 8])
|
||||
comp_config = CompilationConfig(cudagraph_mode=cudagraph_mode_str,
|
||||
level=compilation_level,
|
||||
cudagraph_capture_sizes=[1, 8])
|
||||
|
||||
config = _create_vllm_config(comp_config, max_num_seqs=8)
|
||||
dispatcher = CudagraphDispatcher(config)
|
||||
@ -86,11 +69,11 @@ class TestCudagraphDispatcher:
|
||||
uniform_decode_query_len=1)
|
||||
|
||||
# Verify the key is initialized correctly
|
||||
if params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
|
||||
if cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
|
||||
assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 2
|
||||
else:
|
||||
assert len(dispatcher.cudagraph_keys[CUDAGraphMode.PIECEWISE]) == 0
|
||||
if params["cudagraph_mode"] not in ["NONE", "PIECEWISE"]:
|
||||
if cudagraph_mode_str not in ["NONE", "PIECEWISE"]:
|
||||
assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 2
|
||||
else:
|
||||
assert len(dispatcher.cudagraph_keys[CUDAGraphMode.FULL]) == 0
|
||||
@ -99,10 +82,10 @@ class TestCudagraphDispatcher:
|
||||
# 1. non-uniform batch, size in cudagraph size list
|
||||
desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False)
|
||||
rt_mode, key = dispatcher.dispatch(desc_full_exact)
|
||||
if params["cudagraph_mode"] == "FULL":
|
||||
if cudagraph_mode_str == "FULL":
|
||||
assert rt_mode == CUDAGraphMode.FULL
|
||||
assert key == desc_full_exact
|
||||
elif params["cudagraph_mode"] in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
|
||||
elif cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]:
|
||||
assert rt_mode == CUDAGraphMode.PIECEWISE
|
||||
assert key == desc_full_exact
|
||||
else:
|
||||
@ -111,15 +94,13 @@ class TestCudagraphDispatcher:
|
||||
# 2. uniform decode batch, size in cudagraph size list
|
||||
desc_uniform_exact = BatchDescriptor(num_tokens=8, uniform_decode=True)
|
||||
rt_mode, key = dispatcher.dispatch(desc_uniform_exact)
|
||||
if params["cudagraph_mode"] == "FULL":
|
||||
if cudagraph_mode_str == "FULL":
|
||||
assert rt_mode == CUDAGraphMode.FULL
|
||||
assert key == desc_uniform_exact.non_uniform
|
||||
elif params["cudagraph_mode"] in [
|
||||
"FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"
|
||||
]:
|
||||
elif cudagraph_mode_str in ["FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"]:
|
||||
assert rt_mode == CUDAGraphMode.FULL
|
||||
assert key == desc_uniform_exact
|
||||
elif params["cudagraph_mode"] == "PIECEWISE":
|
||||
elif cudagraph_mode_str == "PIECEWISE":
|
||||
assert rt_mode == CUDAGraphMode.PIECEWISE
|
||||
assert key == desc_uniform_exact.non_uniform
|
||||
else:
|
||||
@ -131,6 +112,16 @@ class TestCudagraphDispatcher:
|
||||
assert rt_mode == CUDAGraphMode.NONE
|
||||
assert key is None
|
||||
|
||||
# 4. Cascade attention should have a fall back mode
|
||||
desc_full_exact = BatchDescriptor(num_tokens=8, uniform_decode=False)
|
||||
rt_mode, key = dispatcher.dispatch(desc_full_exact,
|
||||
use_cascade_attn=True)
|
||||
if "PIECEWISE" in cudagraph_mode_str: # string contains check
|
||||
assert rt_mode == CUDAGraphMode.PIECEWISE
|
||||
assert key == desc_full_exact.non_uniform
|
||||
else:
|
||||
assert rt_mode == CUDAGraphMode.NONE
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
||||
class TestCUDAGraphWrapper:
|
||||
|
||||
@ -4,12 +4,11 @@ import contextlib
|
||||
import os
|
||||
import weakref
|
||||
from contextlib import ExitStack
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.utils import wait_for_gpu_memory_to_clear
|
||||
from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
|
||||
from vllm import LLM
|
||||
from vllm.config import CompilationConfig
|
||||
from vllm.platforms import current_platform
|
||||
@ -34,74 +33,6 @@ def temporary_environ(env_vars):
|
||||
os.environ[k] = v
|
||||
|
||||
|
||||
@dataclass
|
||||
class BackendConfig:
|
||||
name: str
|
||||
env_vars: dict
|
||||
comp_config: dict
|
||||
specific_gpu_arch: Optional[tuple] = None
|
||||
|
||||
|
||||
# Define all backend configurations of full cudagraph to be tested
|
||||
backend_configs = {
|
||||
# FA3 on Hopper
|
||||
"FA3":
|
||||
BackendConfig(name="FA3",
|
||||
env_vars={
|
||||
"VLLM_FLASH_ATTN_VERSION": "3",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FlashMLA on Hopper
|
||||
"FlashMLA":
|
||||
BackendConfig(name="FlashMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASHMLA",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FlashAttention MLA on Hopper
|
||||
"FlashAttentionMLA":
|
||||
BackendConfig(name="FlashAttentionMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FA2
|
||||
"FA2":
|
||||
BackendConfig(name="FA2",
|
||||
env_vars={
|
||||
"VLLM_FLASH_ATTN_VERSION": "2",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
}),
|
||||
# Triton Attention
|
||||
"TritonAttn":
|
||||
BackendConfig(name="TritonAttn",
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
}),
|
||||
# FlashInfer
|
||||
"FlashInfer":
|
||||
BackendConfig(name="FlashInfer",
|
||||
env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
}),
|
||||
}
|
||||
|
||||
# test attention backend and cudagraph_mode combo
|
||||
# (backend_name, cudagraph_mode, supported)
|
||||
combo_cases_1 = [
|
||||
@ -114,9 +45,10 @@ combo_cases_1 = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("combo_case", combo_cases_1)
|
||||
def test_backend_and_cudagraph_mode_combo(combo_case):
|
||||
backend_name, cudagraph_mode, supported = combo_case
|
||||
@pytest.mark.parametrize("backend_name, cudagraph_mode, supported",
|
||||
combo_cases_1)
|
||||
def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode,
|
||||
supported):
|
||||
if backend_name == "FlashInfer":
|
||||
try:
|
||||
import flashinfer # noqa: F401
|
||||
@ -142,7 +74,7 @@ def test_backend_and_cudagraph_mode_combo(combo_case):
|
||||
compilation_config=CompilationConfig(
|
||||
level=3, cudagraph_mode=cudagraph_mode))
|
||||
llm.generate(["Hello, my name is"] * 10)
|
||||
|
||||
# when above code raises, `llm` may be undefined, so we need to catch that
|
||||
try:
|
||||
llm = weakref.proxy(llm)
|
||||
del llm
|
||||
@ -173,7 +105,8 @@ combo_cases_2 = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("combo_case", combo_cases_2)
|
||||
@pytest.mark.parametrize("backend_name,cudagraph_mode,compilation_level,"\
|
||||
"supported", combo_cases_2)
|
||||
def test_cudagraph_compilation_combo(combo_case):
|
||||
backend_name, cudagraph_mode, compilation_level, supported\
|
||||
= combo_case
|
||||
@ -192,6 +125,7 @@ def test_cudagraph_compilation_combo(combo_case):
|
||||
compilation_config=CompilationConfig(
|
||||
level=compilation_level, cudagraph_mode=cudagraph_mode))
|
||||
llm.generate(["Hello, my name is"] * 10)
|
||||
# when above code raises, `llm` may be undefined, so we need to catch that
|
||||
try:
|
||||
llm = weakref.proxy(llm)
|
||||
del llm
|
||||
|
||||
@ -12,7 +12,7 @@ import pytest_asyncio
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from tests.v1.test_utils import check_request_balancing
|
||||
from tests.v1.utils import check_request_balancing
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||
@ -13,7 +13,7 @@ import pytest_asyncio
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from tests.v1.test_utils import check_request_balancing
|
||||
from tests.v1.utils import check_request_balancing
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||
@ -8,7 +8,7 @@ from typing import Any, Union
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.utils import get_attn_backend_list_based_on_platform
|
||||
from tests.utils import get_attn_backend_list_based_on_platform, large_gpu_mark
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.base import VLLM_S3_BUCKET_URL
|
||||
from vllm.assets.image import VLM_IMAGES_DIR
|
||||
@ -88,69 +88,71 @@ def test_ngram_correctness(
|
||||
Compare the outputs of an original LLM and a speculative LLM
|
||||
should be the same when using ngram speculative decoding.
|
||||
'''
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
test_prompts = get_test_prompts(mm_enabled=False)
|
||||
test_prompts = get_test_prompts(mm_enabled=False)
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=1024)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
ref_llm = LLM(model=model_name, max_model_len=1024)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
del ref_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
spec_llm = LLM(
|
||||
model=model_name,
|
||||
speculative_config={
|
||||
"method": "ngram",
|
||||
"prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 3,
|
||||
},
|
||||
max_model_len=1024,
|
||||
)
|
||||
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches >= int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches >= int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["model_setup", "mm_enabled"], [
|
||||
(("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
|
||||
(("eagle", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
|
||||
(("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
|
||||
pytest.param(
|
||||
("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
|
||||
False,
|
||||
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
|
||||
pytest.param(
|
||||
("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
|
||||
True,
|
||||
marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
|
||||
(("eagle", "eagle618/deepseek-v3-random",
|
||||
"eagle618/eagle-deepseek-v3-random", 1), False),
|
||||
],
|
||||
ids=[
|
||||
"qwen3_eagle3", "llama3_eagle", "llama3_eagle3",
|
||||
"llama4_eagle", "llama4_eagle_mm",
|
||||
"deepseek_eagle"
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
["model_setup", "mm_enabled"],
|
||||
[
|
||||
(("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
|
||||
pytest.param(("eagle3", "Qwen/Qwen2.5-VL-7B-Instruct",
|
||||
"Rayzl/qwen2.5-vl-7b-eagle3-sgl", 1),
|
||||
False,
|
||||
marks=pytest.mark.skip(reason="Skipping due to its " \
|
||||
"head_dim not being a a multiple of 32")),
|
||||
(("eagle", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
|
||||
(("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
|
||||
pytest.param(("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
|
||||
False,
|
||||
marks=large_gpu_mark(min_gb=80)), # works on 4x H100
|
||||
pytest.param(("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
|
||||
True,
|
||||
marks=large_gpu_mark(min_gb=80)), # works on 4x H100
|
||||
(("eagle", "eagle618/deepseek-v3-random",
|
||||
"eagle618/eagle-deepseek-v3-random", 1), False),
|
||||
],
|
||||
ids=[
|
||||
"qwen3_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", "llama3_eagle3",
|
||||
"llama4_eagle", "llama4_eagle_mm", "deepseek_eagle"
|
||||
])
|
||||
@pytest.mark.parametrize("attn_backend",
|
||||
get_attn_backend_list_based_on_platform())
|
||||
def test_eagle_correctness(
|
||||
@ -174,9 +176,14 @@ def test_eagle_correctness(
|
||||
model_setup: (method, model_name, eagle_model_name, tp_size)
|
||||
'''
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_MLA_DISABLE", "1")
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||
if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
|
||||
# Scout requires default backend selection
|
||||
# because vision encoder has head_dim 88 being incompatible
|
||||
# with FLASH_ATTN and needs to fall back to Flex Attn
|
||||
pass
|
||||
else:
|
||||
m.setenv("VLLM_MLA_DISABLE", "1")
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||
|
||||
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
|
||||
pytest.skip("TRITON_ATTN does not support "
|
||||
|
||||
@ -8,7 +8,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from tests.v1.test_utils import check_request_balancing
|
||||
from tests.v1.utils import check_request_balancing
|
||||
|
||||
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||
|
||||
|
||||
290
tests/v1/generation/test_batch_invariance.py
Normal file
290
tests/v1/generation/test_batch_invariance.py
Normal file
@ -0,0 +1,290 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import contextlib
|
||||
import os
|
||||
import random
|
||||
import string
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
|
||||
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
|
||||
# Lightweight random prompt generator to vary prompt lengths and content.
|
||||
vocab = [
|
||||
"alpha",
|
||||
"bravo",
|
||||
"charlie",
|
||||
"delta",
|
||||
"echo",
|
||||
"foxtrot",
|
||||
"golf",
|
||||
"hotel",
|
||||
"india",
|
||||
"juliet",
|
||||
"kilo",
|
||||
"lima",
|
||||
"mike",
|
||||
"november",
|
||||
"oscar",
|
||||
"papa",
|
||||
"quebec",
|
||||
"romeo",
|
||||
"sierra",
|
||||
"tango",
|
||||
"uniform",
|
||||
"victor",
|
||||
"whiskey",
|
||||
"xray",
|
||||
"yankee",
|
||||
"zulu",
|
||||
]
|
||||
n = random.randint(min_words, max_words)
|
||||
words = random.choices(vocab, k=n)
|
||||
|
||||
# Add some noise and punctuation variability
|
||||
if random.random() < 0.5:
|
||||
words[0] = words[0].capitalize()
|
||||
if random.random() < 0.2:
|
||||
words.append("".join(random.choices(string.ascii_lowercase, k=5)))
|
||||
punct = random.choice([".", "?", "!", "...", ""])
|
||||
return " ".join(words) + punct
|
||||
|
||||
|
||||
@pytest.mark.timeout(1000)
|
||||
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle():
|
||||
"""
|
||||
Ensures that the same request (the 'needle' prompt) yields identical output
|
||||
whether run alone (bs=1) or mixed into a larger batch (e.g., bs=64),
|
||||
using the high-level v1 LLM() API only (no manual batching).
|
||||
|
||||
Strategy:
|
||||
- Create two LLM engines with identical config except max_num_seqs: 1 vs N.
|
||||
- Compute a baseline output for the needle prompt with the bs=1 engine.
|
||||
- For many trials, generate a batch (size N) where the needle appears at a
|
||||
random position among random filler prompts using the bs=N engine.
|
||||
- Track how many trials match vs mismatch, and report totals at the end.
|
||||
The test fails if any mismatches occur, but we still dump pass/fail
|
||||
counts.
|
||||
|
||||
Notes:
|
||||
- Use seeded stochastic sampling with a fixed seed to test determinism.
|
||||
- Outputs are intentionally longer and sampled at higher temperature/top_p
|
||||
to produce a more random-sounding phrase, yet remain deterministic by
|
||||
seed.
|
||||
- Keep max_tokens and max_model_len bounded for speed and memory use.
|
||||
"""
|
||||
random.seed(12345)
|
||||
|
||||
# Allow overrides from environment (useful for CI tuning)
|
||||
# "facebook/opt-125m" is too small, doesn't reliably test determinism
|
||||
model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
|
||||
num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5"))
|
||||
batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "64"))
|
||||
assert batch_size >= 2, "Batch size should be >= 2 to mix needle."
|
||||
|
||||
# Keep GPU memory usage low to avoid startup allocation failures.
|
||||
gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.3"))
|
||||
max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "4096"))
|
||||
swap_space_gb = int(os.getenv("VLLM_SWAP_SPACE_GB", "4"))
|
||||
|
||||
# Sampling parameters: longer outputs with a more random-sounding
|
||||
# continuation,but still deterministic due to fixed seed.
|
||||
temperature = float(os.getenv("VLLM_NEEDLE_TEMPERATURE", "0.0"))
|
||||
top_p = float(os.getenv("VLLM_NEEDLE_TOP_P", "0.95"))
|
||||
max_tokens = int(os.getenv("VLLM_NEEDLE_MAX_TOKENS", "128"))
|
||||
|
||||
sampling = SamplingParams(
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
max_tokens=max_tokens,
|
||||
seed=20240919,
|
||||
)
|
||||
|
||||
needle_prompt = ("There once was a ")
|
||||
|
||||
llm_bs1 = None
|
||||
llm_bsN = None
|
||||
try:
|
||||
# Engine with bs=1 behavior
|
||||
llm_bs1 = LLM_with_max_seqs(
|
||||
model=model,
|
||||
max_num_seqs=1,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
swap_space=swap_space_gb,
|
||||
)
|
||||
|
||||
# Baseline generation for the needle prompt alone.
|
||||
baseline_out = llm_bs1.generate([needle_prompt], sampling)
|
||||
assert len(baseline_out) == 1
|
||||
assert len(baseline_out[0].outputs) >= 1
|
||||
baseline_text = baseline_out[0].outputs[0].text
|
||||
|
||||
# Engine with larger batch limit (e.g., 64)
|
||||
llm_bsN = LLM_with_max_seqs(
|
||||
model=model,
|
||||
max_num_seqs=batch_size,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
swap_space=swap_space_gb,
|
||||
)
|
||||
|
||||
mismatches = 0
|
||||
|
||||
for trial in range(num_trials):
|
||||
# Create a batch of size `batch_size` and insert the needle at
|
||||
# a random index
|
||||
prompts: list[str] = []
|
||||
needle_pos = random.randint(0, batch_size - 1)
|
||||
for i in range(batch_size):
|
||||
if i == needle_pos:
|
||||
prompts.append(needle_prompt)
|
||||
else:
|
||||
prompts.append(_random_prompt())
|
||||
|
||||
# Generate with the larger-batch engine
|
||||
outputs = llm_bsN.generate(prompts, sampling)
|
||||
# Find the needle output by position
|
||||
needle_output = outputs[needle_pos]
|
||||
assert needle_output.prompt == needle_prompt
|
||||
assert len(needle_output.outputs) >= 1
|
||||
text = needle_output.outputs[0].text
|
||||
|
||||
if text != baseline_text:
|
||||
mismatches += 1
|
||||
|
||||
passes = num_trials - mismatches
|
||||
# Dump how many passed vs failed
|
||||
print(f"[determinism] total={num_trials}, passed={passes}, "
|
||||
f"failed={mismatches}, batch_size={batch_size}")
|
||||
|
||||
if mismatches > 0:
|
||||
pytest.fail(
|
||||
f"Nondeterministic outputs detected: {mismatches} failed out "
|
||||
f"of {num_trials} trials (batch_size={batch_size}).")
|
||||
|
||||
finally:
|
||||
# Ensure engines are shutdown to free GPU/VRAM across test sessions
|
||||
if llm_bs1 is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
llm_bs1.shutdown()
|
||||
if llm_bsN is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
llm_bsN.shutdown()
|
||||
|
||||
|
||||
def _extract_step_logprobs(request_output):
|
||||
if getattr(request_output, "outputs", None):
|
||||
inner = request_output.outputs[0]
|
||||
if hasattr(inner, "logprobs") and inner.logprobs is not None:
|
||||
t = torch.tensor(
|
||||
[
|
||||
inner.logprobs[i][tid].logprob
|
||||
for i, tid in enumerate(inner.token_ids)
|
||||
],
|
||||
dtype=torch.float32,
|
||||
)
|
||||
return t
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not torch.cuda.is_available(),
|
||||
reason="Requires CUDA to match production inference path.",
|
||||
)
|
||||
def test_logprobs_bitwise_batch_invariance_bs1_vs_bs2():
|
||||
|
||||
#model_name = os.getenv("VLLM_TEST_MODEL", "facebook/opt-125m")
|
||||
model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B")
|
||||
tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1"))
|
||||
|
||||
# Force float32 to avoid precision-induced differences.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
tensor_parallel_size=tp_size,
|
||||
enforce_eager=True, # helps reduce nondeterminism from some backends
|
||||
)
|
||||
|
||||
prompts = [
|
||||
"The capital of France is",
|
||||
"The capital of Germany is",
|
||||
]
|
||||
|
||||
sp = SamplingParams(
|
||||
temperature=0.0,
|
||||
top_p=1.0,
|
||||
max_tokens=8,
|
||||
# Seed shouldn't matter at temperature=0, but keeping it stable anyway.
|
||||
seed=1234,
|
||||
logprobs=5,
|
||||
)
|
||||
|
||||
# BS=1: run prompts individually and collect logprobs per step.
|
||||
bs1_logprobs_per_prompt = []
|
||||
for p in prompts:
|
||||
outs = llm.generate([p], sp, use_tqdm=False)
|
||||
assert len(outs) == 1
|
||||
step_logprobs = _extract_step_logprobs(outs[0])
|
||||
if step_logprobs is None:
|
||||
pytest.skip("Logits are not available on RequestOutput; "
|
||||
"enable logprobs return to run this test.")
|
||||
bs1_logprobs_per_prompt.append(step_logprobs)
|
||||
|
||||
# BS=2: run prompts in a batch and collect logprobs per step for each
|
||||
# prompt.
|
||||
outs_batched = llm.generate(prompts, sp, use_tqdm=False)
|
||||
assert len(outs_batched) == len(prompts)
|
||||
bs2_logprobs_per_prompt = []
|
||||
for o in outs_batched:
|
||||
step_logprobs = _extract_step_logprobs(o)
|
||||
if step_logprobs is None:
|
||||
pytest.skip("Logits are not available on RequestOutput; "
|
||||
"enable logprobs return to run this test.")
|
||||
bs2_logprobs_per_prompt.append(step_logprobs)
|
||||
|
||||
# Compare step-by-step logprobs for each prompt between BS=1 and BS=2 runs.
|
||||
for i, (logprobs_bs1, logprobs_bs2) in enumerate(
|
||||
zip(bs1_logprobs_per_prompt, bs2_logprobs_per_prompt)):
|
||||
assert len(logprobs_bs1) == len(logprobs_bs2), (
|
||||
f"Different number of generation steps for prompt index {i}: "
|
||||
f"{len(logprobs_bs1)} (BS=1) vs {len(logprobs_bs2)} (BS=2)")
|
||||
for t, (a, b) in enumerate(zip(logprobs_bs1, logprobs_bs2)):
|
||||
assert a.shape == b.shape, (
|
||||
f"Logits shape mismatch at prompt {i}, step {t}: "
|
||||
f"{a.shape} vs {b.shape}")
|
||||
# Bitwise exact equality.
|
||||
assert torch.equal(
|
||||
a, b), (f"Bitwise logprobs mismatch at prompt {i}, step {t} "
|
||||
f"(dtype={a.dtype}, shape={a.shape}).")
|
||||
|
||||
|
||||
def LLM_with_max_seqs(
|
||||
model: str,
|
||||
max_num_seqs: int,
|
||||
gpu_memory_utilization: float,
|
||||
max_model_len: int,
|
||||
swap_space: int,
|
||||
) -> LLM:
|
||||
"""
|
||||
Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
|
||||
using the high-level v1 LLM API, while constraining memory usage.
|
||||
"""
|
||||
return LLM(
|
||||
model=model,
|
||||
max_num_seqs=max_num_seqs,
|
||||
# Constrain GPU memory pool so test can run even on busy GPUs.
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
# Keep KV cache footprint small while allowing longer outputs.
|
||||
max_model_len=max_model_len,
|
||||
# Allow some CPU offload if needed.
|
||||
swap_space=swap_space,
|
||||
# Keep things lean and CI-friendly.
|
||||
dtype="float16",
|
||||
# Single-GPU by default; override externally if desired.
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
trust_remote_code=os.getenv("VLLM_TRUST_REMOTE_CODE", "0") == "1",
|
||||
)
|
||||
@ -1,6 +1,31 @@
|
||||
#!/bin/bash
|
||||
set -xe
|
||||
|
||||
# Parse command line arguments
|
||||
KV_BUFFER_DEVICE="cuda" # Default to cuda
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--kv_buffer_device)
|
||||
KV_BUFFER_DEVICE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option $1"
|
||||
echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
|
||||
|
||||
# Build the kv-transfer-config once
|
||||
if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then
|
||||
KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
else
|
||||
KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}"
|
||||
fi
|
||||
|
||||
# Models to run
|
||||
MODELS=(
|
||||
"Qwen/Qwen3-0.6B"
|
||||
@ -79,7 +104,7 @@ run_tests_for_model() {
|
||||
|
||||
# Calculate port number (base port + instance number)
|
||||
PORT=$((8100 + i))
|
||||
# Calculate side channel port. Avoid clash with with TP workers.
|
||||
# Calculate side channel port. Avoid clash with with TP workers.
|
||||
SIDE_CHANNEL_PORT=$((5559 + i))
|
||||
|
||||
echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
|
||||
@ -93,7 +118,7 @@ run_tests_for_model() {
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.2 \
|
||||
--tensor-parallel-size $PREFILLER_TP_SIZE \
|
||||
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
|
||||
--kv-transfer-config '$KV_CONFIG'"
|
||||
|
||||
if [ -n "$model_args" ]; then
|
||||
FULL_CMD="$BASE_CMD $model_args"
|
||||
@ -128,7 +153,7 @@ run_tests_for_model() {
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.2 \
|
||||
--tensor-parallel-size $DECODER_TP_SIZE \
|
||||
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
|
||||
--kv-transfer-config '$KV_CONFIG'"
|
||||
|
||||
if [ -n "$model_args" ]; then
|
||||
FULL_CMD="$BASE_CMD $model_args"
|
||||
|
||||
37
tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
Normal file → Executable file
37
tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh
Normal file → Executable file
@ -1,6 +1,33 @@
|
||||
#!/bin/bash
|
||||
set -xe
|
||||
|
||||
# Parse command line arguments
|
||||
KV_BUFFER_DEVICE="cuda" # Default to cuda
|
||||
PREFILL_GPU_ID=4 # Default GPU IDs
|
||||
DECODE_GPU_ID=5
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--kv_buffer_device)
|
||||
KV_BUFFER_DEVICE="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option $1"
|
||||
echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "Running edge case tests with kv_buffer_device=$KV_BUFFER_DEVICE (GPUs: $PREFILL_GPU_ID, $DECODE_GPU_ID)"
|
||||
|
||||
# Build the kv-transfer-config once
|
||||
if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then
|
||||
KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
else
|
||||
KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}"
|
||||
fi
|
||||
|
||||
# Models to run
|
||||
MODELS=(
|
||||
"Qwen/Qwen3-0.6B"
|
||||
@ -50,15 +77,15 @@ run_tests_for_model() {
|
||||
|
||||
# Get model-specific arguments
|
||||
local model_args=$(get_model_args "$model_name")
|
||||
|
||||
|
||||
# Start prefill instance
|
||||
PREFILL_PORT=8001
|
||||
|
||||
BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
|
||||
BASE_CMD="CUDA_VISIBLE_DEVICES=$PREFILL_GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
|
||||
--port $PREFILL_PORT \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.2 \
|
||||
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
|
||||
--kv-transfer-config '$KV_CONFIG'"
|
||||
|
||||
if [ -n "$model_args" ]; then
|
||||
FULL_CMD="$BASE_CMD $model_args"
|
||||
@ -72,11 +99,11 @@ run_tests_for_model() {
|
||||
DECODE_PORT=8002
|
||||
|
||||
# Build the command with or without model-specific args
|
||||
BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
|
||||
BASE_CMD="CUDA_VISIBLE_DEVICES=$DECODE_GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
|
||||
--port $DECODE_PORT \
|
||||
--enforce-eager \
|
||||
--gpu-memory-utilization 0.2 \
|
||||
--kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
|
||||
--kv-transfer-config '$KV_CONFIG'"
|
||||
|
||||
if [ -n "$model_args" ]; then
|
||||
FULL_CMD="$BASE_CMD $model_args"
|
||||
|
||||
@ -337,19 +337,13 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
|
||||
"target_attn_1": mock.MagicMock(),
|
||||
"target_attn_2": mock.MagicMock()
|
||||
}
|
||||
target_indx_layers: dict[str, mock.MagicMock] = {}
|
||||
# Draft model has one extra attention layer compared to target model
|
||||
all_attn_layers = {
|
||||
**target_attn_layers, "draft_extra_attn": mock.MagicMock()
|
||||
}
|
||||
|
||||
all_indx_layers: dict[str, mock.MagicMock] = {}
|
||||
|
||||
# Make mock_get_layers return different values for each call
|
||||
mock_get_layers.side_effect = [
|
||||
target_attn_layers, target_indx_layers, all_attn_layers,
|
||||
all_indx_layers
|
||||
]
|
||||
mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]
|
||||
|
||||
# Setup mock for pp group to return the appropriate value for world size
|
||||
mock_pp_group = mock.MagicMock()
|
||||
@ -664,9 +658,6 @@ def test_propose_tree(spec_token_tree):
|
||||
# Mock runner for attention metadata building.
|
||||
proposer.runner = mock.MagicMock()
|
||||
proposer.runner.attn_groups.append([mock.MagicMock()])
|
||||
proposer.runner.attn_groups[0][0].metadata_builders = [
|
||||
attn_metadata_builder
|
||||
]
|
||||
proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
|
||||
attn_metadata_builder
|
||||
proposer._get_attention_metadata_builder = mock.MagicMock(
|
||||
|
||||
@ -63,13 +63,7 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers,
|
||||
|
||||
target_attn_layers = {"target_attn_1": mock.MagicMock()}
|
||||
all_attn_layers = {**target_attn_layers, "draft_attn_1": mock.MagicMock()}
|
||||
target_indexer_layers: dict = {}
|
||||
all_indexer_layers: dict = {}
|
||||
|
||||
mock_get_layers.side_effect = [
|
||||
target_attn_layers, target_indexer_layers, all_attn_layers,
|
||||
all_indexer_layers
|
||||
]
|
||||
mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]
|
||||
|
||||
mock_pp_group = mock.MagicMock()
|
||||
mock_pp_group.world_size = 1
|
||||
|
||||
@ -1,71 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import regex as re
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.v1.worker.utils import bind_kv_cache
|
||||
|
||||
|
||||
def test_bind_kv_cache():
|
||||
from vllm.attention import Attention
|
||||
|
||||
ctx = {
|
||||
'layers.0.self_attn': Attention(32, 128, 0.1),
|
||||
'layers.1.self_attn': Attention(32, 128, 0.1),
|
||||
'layers.2.self_attn': Attention(32, 128, 0.1),
|
||||
'layers.3.self_attn': Attention(32, 128, 0.1),
|
||||
}
|
||||
kv_cache = {
|
||||
'layers.0.self_attn': torch.zeros((1, )),
|
||||
'layers.1.self_attn': torch.zeros((1, )),
|
||||
'layers.2.self_attn': torch.zeros((1, )),
|
||||
'layers.3.self_attn': torch.zeros((1, )),
|
||||
}
|
||||
runner_kv_caches: list[torch.Tensor] = []
|
||||
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
||||
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.0.self_attn']
|
||||
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.1.self_attn']
|
||||
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.2.self_attn']
|
||||
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.3.self_attn']
|
||||
|
||||
assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
|
||||
assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
|
||||
assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
|
||||
assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
|
||||
|
||||
|
||||
def test_bind_kv_cache_non_attention():
|
||||
from vllm.attention import Attention
|
||||
|
||||
# example from Jamba PP=2
|
||||
ctx = {
|
||||
'model.layers.20.attn': Attention(32, 128, 0.1),
|
||||
'model.layers.28.attn': Attention(32, 128, 0.1),
|
||||
}
|
||||
kv_cache = {
|
||||
'model.layers.20.attn': torch.zeros((1, )),
|
||||
'model.layers.28.attn': torch.zeros((1, )),
|
||||
}
|
||||
|
||||
runner_kv_caches: list[torch.Tensor] = []
|
||||
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
||||
|
||||
assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
|
||||
'model.layers.20.attn']
|
||||
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
|
||||
'model.layers.28.attn']
|
||||
|
||||
assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
|
||||
assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
|
||||
|
||||
|
||||
# Prometheus metrics utilities for testing
|
||||
|
||||
63
tests/v1/worker/test_utils.py
Normal file
63
tests/v1/worker/test_utils.py
Normal file
@ -0,0 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.v1.worker.utils import bind_kv_cache
|
||||
|
||||
|
||||
def test_bind_kv_cache():
|
||||
from vllm.attention import Attention
|
||||
|
||||
ctx = {
|
||||
'layers.0.self_attn': Attention(32, 128, 0.1),
|
||||
'layers.1.self_attn': Attention(32, 128, 0.1),
|
||||
'layers.2.self_attn': Attention(32, 128, 0.1),
|
||||
'layers.3.self_attn': Attention(32, 128, 0.1),
|
||||
}
|
||||
kv_cache = {
|
||||
'layers.0.self_attn': torch.zeros((1, )),
|
||||
'layers.1.self_attn': torch.zeros((1, )),
|
||||
'layers.2.self_attn': torch.zeros((1, )),
|
||||
'layers.3.self_attn': torch.zeros((1, )),
|
||||
}
|
||||
runner_kv_caches: list[torch.Tensor] = []
|
||||
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
||||
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.0.self_attn']
|
||||
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.1.self_attn']
|
||||
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.2.self_attn']
|
||||
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
|
||||
'layers.3.self_attn']
|
||||
|
||||
assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
|
||||
assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
|
||||
assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
|
||||
assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
|
||||
|
||||
|
||||
def test_bind_kv_cache_non_attention():
|
||||
from vllm.attention import Attention
|
||||
|
||||
# example from Jamba PP=2
|
||||
ctx = {
|
||||
'model.layers.20.attn': Attention(32, 128, 0.1),
|
||||
'model.layers.28.attn': Attention(32, 128, 0.1),
|
||||
}
|
||||
kv_cache = {
|
||||
'model.layers.20.attn': torch.zeros((1, )),
|
||||
'model.layers.28.attn': torch.zeros((1, )),
|
||||
}
|
||||
|
||||
runner_kv_caches: list[torch.Tensor] = []
|
||||
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
||||
|
||||
assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
|
||||
'model.layers.20.attn']
|
||||
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
|
||||
'model.layers.28.attn']
|
||||
|
||||
assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
|
||||
assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
|
||||
63
tools/flashinfer-build.sh
Normal file
63
tools/flashinfer-build.sh
Normal file
@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
|
||||
# This script is used to build FlashInfer wheels with AOT kernels
|
||||
|
||||
set -ex
|
||||
|
||||
# FlashInfer configuration
|
||||
FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF}"
|
||||
CUDA_VERSION="${CUDA_VERSION}"
|
||||
BUILD_WHEEL="${BUILD_WHEEL:-true}"
|
||||
|
||||
if [[ -z "${FLASHINFER_GIT_REF}" ]]; then
|
||||
echo "❌ FLASHINFER_GIT_REF must be specified" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -z "${CUDA_VERSION}" ]]; then
|
||||
echo "❌ CUDA_VERSION must be specified" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"
|
||||
|
||||
# Clone FlashInfer
|
||||
git clone --depth 1 --recursive --shallow-submodules \
|
||||
--branch ${FLASHINFER_GIT_REF} \
|
||||
${FLASHINFER_GIT_REPO} flashinfer
|
||||
|
||||
# Set CUDA arch list based on CUDA version
|
||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
||||
else
|
||||
# CUDA 12.8+ supports 10.0a and 12.0
|
||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
||||
fi
|
||||
|
||||
echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||
|
||||
pushd flashinfer
|
||||
# Make sure the wheel is built for the correct CUDA version
|
||||
export UV_TORCH_BACKEND=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# Build AOT kernels
|
||||
export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
||||
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
|
||||
python3 -m flashinfer.aot
|
||||
|
||||
if [[ "${BUILD_WHEEL}" == "true" ]]; then
|
||||
# Build wheel for distribution
|
||||
uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist .
|
||||
echo "✅ FlashInfer wheel built successfully in flashinfer-dist/"
|
||||
else
|
||||
# Install directly (for Dockerfile)
|
||||
uv pip install --system --no-build-isolation --force-reinstall .
|
||||
echo "✅ FlashInfer installed successfully"
|
||||
fi
|
||||
popd
|
||||
|
||||
# Cleanup
|
||||
rm -rf flashinfer
|
||||
@ -6,7 +6,7 @@ set -e
|
||||
|
||||
# Default values
|
||||
DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
|
||||
DEEPGEMM_GIT_REF="ea9c5d9270226c5dd7a577c212e9ea385f6ef048"
|
||||
DEEPGEMM_GIT_REF="594953acce41793ae00a1233eb516044d604bcb6"
|
||||
|
||||
# Parse command line arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
|
||||
@ -36,7 +36,6 @@ ALLOWED_FILES = {
|
||||
'benchmarks/cutlass_benchmarks/w8a8_benchmarks.py',
|
||||
'benchmarks/cutlass_benchmarks/sparse_benchmarks.py',
|
||||
# cloudpickle
|
||||
'vllm/worker/worker_base.py',
|
||||
'vllm/executor/mp_distributed_executor.py',
|
||||
'vllm/executor/ray_distributed_executor.py',
|
||||
'vllm/entrypoints/llm.py',
|
||||
|
||||
@ -20,8 +20,7 @@ from vllm.forward_context import ForwardContext, get_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
@ -280,9 +279,8 @@ class Attention(nn.Module, AttentionLayerBase):
|
||||
`vllm.forward_context.get_forward_context().attn_metadata`.
|
||||
"""
|
||||
if self.calculate_kv_scales:
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
if attn_metadata.enable_kv_scales_calculation:
|
||||
self.calc_kv_scales(query, key, value)
|
||||
torch.ops.vllm.maybe_calc_kv_scales(query, key, value,
|
||||
self.layer_name)
|
||||
|
||||
output_dtype = query.dtype
|
||||
if self.query_quant is not None:
|
||||
@ -557,6 +555,44 @@ def maybe_save_kv_layer_to_connector(
|
||||
attn_metadata[layer_name])
|
||||
|
||||
|
||||
def maybe_calc_kv_scales(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
layer_name: str,
|
||||
) -> None:
|
||||
|
||||
forward_context: ForwardContext = get_forward_context()
|
||||
attn_metadata = forward_context.attn_metadata
|
||||
|
||||
if isinstance(attn_metadata, dict):
|
||||
attn_metadata = attn_metadata[layer_name]
|
||||
|
||||
if attn_metadata is None or not getattr(
|
||||
attn_metadata, 'enable_kv_scales_calculation', False):
|
||||
return
|
||||
|
||||
self = forward_context.no_compile_layers[layer_name]
|
||||
self.calc_kv_scales(query, key, value)
|
||||
|
||||
|
||||
def maybe_calc_kv_scales_fake(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
layer_name: str,
|
||||
) -> None:
|
||||
return
|
||||
|
||||
|
||||
direct_register_custom_op(
|
||||
op_name="maybe_calc_kv_scales",
|
||||
op_func=maybe_calc_kv_scales,
|
||||
mutates_args=["query", "key", "value"],
|
||||
fake_impl=maybe_calc_kv_scales_fake,
|
||||
)
|
||||
|
||||
|
||||
def unified_attention(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
|
||||
@ -9,7 +9,8 @@ from vllm import envs
|
||||
from vllm.attention.backends.abstract import (AttentionBackend,
|
||||
AttentionMetadata)
|
||||
from vllm.attention.selector import get_attn_backend
|
||||
from vllm.config import CacheConfig, QuantizationConfig
|
||||
from vllm.config import CacheConfig
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
CommonAttentionMetadata, make_local_attention_virtual_batches,
|
||||
subclass_attention_backend)
|
||||
|
||||
@ -136,7 +136,7 @@ def flash_mla_with_kvcache(
|
||||
descale_k is None
|
||||
), "descale_q and descale_k should be both None or both not None"
|
||||
|
||||
if indices is None and q.element_size() == 1:
|
||||
if (descale_q is not None) and (descale_k is not None):
|
||||
out, softmax_lse = torch.ops._flashmla_extension_C.fwd_kvcache_mla_fp8(
|
||||
q, k_cache, head_dim_v, cache_seqlens, block_table, softmax_scale,
|
||||
causal, tile_scheduler_metadata, num_splits, descale_q, descale_k)
|
||||
|
||||
@ -137,7 +137,7 @@ def triton_reshape_and_cache_flash(
|
||||
|
||||
# heuristics instead of autotuning
|
||||
TILE_SIZE = min(2048, triton.next_power_of_2(n))
|
||||
if torch.version.hip or torch.version.xpu:
|
||||
if current_platform.is_rocm() or current_platform.is_xpu():
|
||||
num_stages = 4
|
||||
num_warps = 8
|
||||
else: # cuda
|
||||
|
||||
@ -366,11 +366,67 @@ def process_video(video: Any) -> Mapping[str, Any]:
|
||||
f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
|
||||
)
|
||||
|
||||
|
||||
def gen_prompt_decode_to_target_len(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
token_sequence: list[int],
|
||||
target_token_len: int,
|
||||
max_retry: int = 10,
|
||||
add_special_tokens: bool = False,
|
||||
rng: Optional[np.random.Generator] = None,
|
||||
) -> tuple[str, list[int]]:
|
||||
"""
|
||||
Ensure decoded-then-encoded prompt length matches the target token length.
|
||||
|
||||
This function decodes an initial token sequence to text and re-encodes it
|
||||
, iteratively adjusting the token sequence length to match a target.
|
||||
This is necessary because some tokenizers do not guarantee a 1:1 mapping
|
||||
between consecutive tokens and the decoded-then-encoded sequence length.
|
||||
For example, for GPT2Tokenizer:
|
||||
[6880, 6881] -> ['Ġcalls', 'here'] ->
|
||||
[1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
|
||||
|
||||
Returns a tuple of the final prompt string and the adjusted token sequence.
|
||||
"""
|
||||
remain_num_try = max_retry
|
||||
token_mismatch = 0
|
||||
while True:
|
||||
prompt = tokenizer.decode(token_sequence)
|
||||
token_sequence = tokenizer.encode(
|
||||
prompt, add_special_tokens=add_special_tokens
|
||||
)
|
||||
if remain_num_try <= 0:
|
||||
if len(token_sequence) != target_token_len:
|
||||
token_mismatch = len(token_sequence) - target_token_len
|
||||
break
|
||||
|
||||
if len(token_sequence) == target_token_len:
|
||||
break
|
||||
elif len(token_sequence) < target_token_len:
|
||||
if rng is not None:
|
||||
extra_tokens = rng.integers(
|
||||
0,
|
||||
tokenizer.vocab_size,
|
||||
size=target_token_len - len(token_sequence),
|
||||
).tolist()
|
||||
else:
|
||||
extra_tokens = np.random.randint(
|
||||
0,
|
||||
tokenizer.vocab_size,
|
||||
size=target_token_len - len(token_sequence),
|
||||
).tolist()
|
||||
token_sequence.extend(extra_tokens)
|
||||
elif len(token_sequence) > target_token_len:
|
||||
token_sequence = token_sequence[:target_token_len]
|
||||
|
||||
remain_num_try -= 1
|
||||
|
||||
return prompt, token_sequence, token_mismatch
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Random Dataset Implementation (Synthetic Data)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class RandomDataset(BenchmarkDataset):
|
||||
"""
|
||||
Synthetic text-only dataset for serving/throughput benchmarks.
|
||||
@ -420,8 +476,9 @@ class RandomDataset(BenchmarkDataset):
|
||||
vocab_size = tokenizer.vocab_size
|
||||
|
||||
requests = []
|
||||
token_mismatch_total = 0
|
||||
for i in range(num_requests):
|
||||
prompt, total_input_len = self.generate_token_sequence(
|
||||
prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501
|
||||
tokenizer=tokenizer,
|
||||
prefix_token_ids=prefix_token_ids,
|
||||
prefix_len=prefix_len,
|
||||
@ -430,6 +487,7 @@ class RandomDataset(BenchmarkDataset):
|
||||
offset=int(offsets[i]),
|
||||
index=i,
|
||||
)
|
||||
token_mismatch_total += token_mismatch
|
||||
requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
@ -453,6 +511,18 @@ class RandomDataset(BenchmarkDataset):
|
||||
)
|
||||
)
|
||||
requests = batch_requests
|
||||
|
||||
if token_mismatch_total != 0:
|
||||
sign = "more" if token_mismatch_total > 0 else "fewer"
|
||||
logger.warning(
|
||||
"Across all generated prompts, there were %d %s tokens "
|
||||
"than expected after decoding and re-encoding. This is "
|
||||
"expected due to the imperfect nature of the sampling "
|
||||
"procedure.",
|
||||
abs(token_mismatch_total),
|
||||
sign,
|
||||
)
|
||||
|
||||
return requests
|
||||
|
||||
def get_prefix(
|
||||
@ -530,7 +600,7 @@ class RandomDataset(BenchmarkDataset):
|
||||
input_len: int,
|
||||
offset: int,
|
||||
index: int,
|
||||
) -> tuple[str, int]:
|
||||
) -> tuple[str, int, int]:
|
||||
"""
|
||||
Returns (prompt, total_input_len).
|
||||
|
||||
@ -549,15 +619,16 @@ class RandomDataset(BenchmarkDataset):
|
||||
token_sequence = prefix_token_ids + inner_seq
|
||||
|
||||
# Decode, then re-encode and truncate to preserve token count invariants
|
||||
prompt = tokenizer.decode(token_sequence)
|
||||
total_input_len = prefix_len + int(input_len)
|
||||
|
||||
re_encoded_sequence = tokenizer.encode(
|
||||
prompt, add_special_tokens=False)[:total_input_len]
|
||||
prompt = tokenizer.decode(re_encoded_sequence)
|
||||
total_input_len = len(re_encoded_sequence)
|
||||
|
||||
return prompt, total_input_len
|
||||
prompt, adjusted_token_sequence, token_mismatch = gen_prompt_decode_to_target_len( # noqa: E501
|
||||
tokenizer=tokenizer,
|
||||
token_sequence=token_sequence,
|
||||
target_token_len=total_input_len,
|
||||
add_special_tokens=False,
|
||||
rng=self._rng,
|
||||
)
|
||||
total_input_len = len(adjusted_token_sequence)
|
||||
return prompt, total_input_len, token_mismatch
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -873,8 +944,9 @@ class RandomMultiModalDataset(RandomDataset):
|
||||
vocab_size = tokenizer.vocab_size
|
||||
# Add synthetic multimodal items to each request
|
||||
mm_requests = []
|
||||
token_mismatch_total = 0
|
||||
for i in range(num_requests):
|
||||
prompt, total_input_len = self.generate_token_sequence(
|
||||
prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501
|
||||
tokenizer=tokenizer,
|
||||
prefix_token_ids=prefix_token_ids,
|
||||
prefix_len=prefix_len,
|
||||
@ -883,6 +955,7 @@ class RandomMultiModalDataset(RandomDataset):
|
||||
offset=int(offsets[i]),
|
||||
index=i,
|
||||
)
|
||||
token_mismatch_total += token_mismatch
|
||||
# Get multimodal item iterator for a given request
|
||||
mm_item_iterator = self.get_mm_item_iterator(
|
||||
min_num_mm_items,
|
||||
@ -918,6 +991,18 @@ class RandomMultiModalDataset(RandomDataset):
|
||||
request_id=request_id_prefix + str(i),
|
||||
)
|
||||
mm_requests.append(sample_request)
|
||||
|
||||
if token_mismatch_total != 0:
|
||||
sign = "more" if token_mismatch_total > 0 else "fewer"
|
||||
logger.warning(
|
||||
"Across all generated prompts, there were %d %s tokens "
|
||||
"than expected after decoding and re-encoding. This is "
|
||||
"expected due to the imperfect nature of the sampling "
|
||||
"procedure.",
|
||||
abs(token_mismatch_total),
|
||||
sign,
|
||||
)
|
||||
|
||||
return mm_requests
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -1450,6 +1535,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
||||
):
|
||||
dataset_class = MLPerfDataset
|
||||
args.hf_split = "train"
|
||||
elif (
|
||||
args.dataset_path in MMStarDataset.SUPPORTED_DATASET_PATHS
|
||||
or args.hf_name in MMStarDataset.SUPPORTED_DATASET_PATHS
|
||||
):
|
||||
dataset_class = MMStarDataset
|
||||
args.hf_split = "val"
|
||||
args.hf_subset = None
|
||||
else:
|
||||
supported_datasets = set([
|
||||
dataset_name for cls in HuggingFaceDataset.__subclasses__()
|
||||
@ -2687,27 +2779,23 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
|
||||
# Generate random tokens
|
||||
tokens = np.random.randint(
|
||||
0, vocab_size, size=target_length).tolist()
|
||||
text = tokenizer.decode(tokens)
|
||||
re_encoded = tokenizer.encode(text, add_special_tokens=False)
|
||||
|
||||
if len(re_encoded) == target_length:
|
||||
return re_encoded
|
||||
elif len(re_encoded) < target_length:
|
||||
# Recursively generate additional consistent tokens
|
||||
needed = target_length - len(re_encoded)
|
||||
extra_tokens = _generate_exact_length_tokens(needed)
|
||||
return re_encoded + extra_tokens
|
||||
else:
|
||||
# Truncate to target length
|
||||
return re_encoded[:target_length]
|
||||
_, adjusted_tokens, token_mismatch = gen_prompt_decode_to_target_len( # noqa: E501
|
||||
tokenizer=tokenizer,
|
||||
token_sequence=tokens,
|
||||
target_token_len=target_length,
|
||||
add_special_tokens=False,
|
||||
)
|
||||
return adjusted_tokens, token_mismatch
|
||||
|
||||
requests = []
|
||||
token_mismatch_total = 0
|
||||
for _ in range(num_prefixes):
|
||||
prefix_tokens = _generate_exact_length_tokens(prefix_len)
|
||||
|
||||
for _ in range(prompts_per_prefix):
|
||||
suffix_tokens = _generate_exact_length_tokens(suffix_len)
|
||||
|
||||
suffix_tokens, token_mistmatch = _generate_exact_length_tokens(suffix_len) # noqa: E501
|
||||
token_mismatch_total += token_mistmatch
|
||||
combined_tokens = prefix_tokens + suffix_tokens
|
||||
prompt = tokenizer.decode(combined_tokens)
|
||||
prompt_len = len(combined_tokens)
|
||||
@ -2719,5 +2807,88 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
|
||||
)
|
||||
)
|
||||
|
||||
if token_mismatch_total != 0:
|
||||
sign = "more" if token_mismatch_total > 0 else "fewer"
|
||||
logger.warning(
|
||||
"Across all generated prompts, there were %d %s tokens "
|
||||
"than expected after decoding and re-encoding. This is "
|
||||
"expected due to the imperfect nature of the sampling "
|
||||
"procedure.",
|
||||
abs(token_mismatch_total),
|
||||
sign,
|
||||
)
|
||||
random.shuffle(requests)
|
||||
return requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# MMStar Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class MMStarDataset(HuggingFaceDataset):
|
||||
"""
|
||||
Lin-Chen/MMStar: https://huggingface.co/datasets/Lin-Chen/MMStar
|
||||
refer to: https://github.com/sgl-project/SpecForge/pull/106
|
||||
"""
|
||||
DEFAULT_OUTPUT_LEN = 128
|
||||
SUPPORTED_DATASET_PATHS = {"Lin-Chen/MMStar"}
|
||||
IS_MULTIMODAL = True
|
||||
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
request_id_prefix: str = "",
|
||||
no_oversample: bool = False,
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
# If --hf-output-len is not set, use the default output length.
|
||||
output_len = (output_len
|
||||
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
||||
sampled_requests: list[SampleRequest] = []
|
||||
|
||||
for ind, item in enumerate(self.data):
|
||||
if len(sampled_requests) >= num_requests:
|
||||
break
|
||||
# Split the question text from options
|
||||
# (keep only the part before "Options:").
|
||||
full_q: str = item.get("question", "")
|
||||
question_text = full_q.split("Options:", 1)[0].strip()
|
||||
|
||||
# Multimodal image content.
|
||||
mm_content = process_image(item["image"])
|
||||
|
||||
# Compute prompt token length (note: this is plain text length
|
||||
# if enable_multimodal_chat is False).
|
||||
prompt_len = len(tokenizer(question_text).input_ids)
|
||||
|
||||
if enable_multimodal_chat:
|
||||
# If multimodal content should be embedded in the chat message,
|
||||
# convert to [{"role":"user","content":[...]}]
|
||||
prompt = self.apply_multimodal_chat_transformation(
|
||||
question_text, mm_content
|
||||
)
|
||||
mm_for_request = None # Already embedded in chat content.
|
||||
else:
|
||||
# Default: prompt is plain text,
|
||||
# image is in mm_content for the bench to assemble.
|
||||
prompt = question_text
|
||||
mm_for_request = mm_content
|
||||
|
||||
sampled_requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
multi_modal_data=mm_for_request,
|
||||
request_id=request_id_prefix + str(ind),
|
||||
)
|
||||
)
|
||||
|
||||
self.maybe_oversample_requests(
|
||||
sampled_requests, num_requests, request_id_prefix, no_oversample
|
||||
)
|
||||
return sampled_requests
|
||||
|
||||
@ -358,7 +358,23 @@ def get_requests(args, tokenizer):
|
||||
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
|
||||
# Remove None values
|
||||
sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
|
||||
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
||||
requests = dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
||||
requests = filter_requests_for_dp(requests, args.data_parallel_size)
|
||||
return requests
|
||||
|
||||
|
||||
def filter_requests_for_dp(requests, data_parallel_size):
|
||||
# Note(zhuohan): The way we get data_parallel_rank is hacky and only
|
||||
# works for external launcher mode. Should be cleaned up and deprecated
|
||||
# in the future with a better vLLM distributed process design.
|
||||
if data_parallel_size == 1:
|
||||
return requests
|
||||
|
||||
global_rank = int(os.environ["RANK"])
|
||||
world_size = int(os.environ["WORLD_SIZE"])
|
||||
data_parallel_rank = global_rank // (world_size // data_parallel_size)
|
||||
return [r for i, r in enumerate(requests)
|
||||
if i % data_parallel_size == data_parallel_rank]
|
||||
|
||||
|
||||
def validate_args(args):
|
||||
@ -453,12 +469,17 @@ def validate_args(args):
|
||||
if args.backend == "mii" and args.tokenizer != args.model:
|
||||
raise ValueError(
|
||||
"Tokenizer must be the same as the model for MII backend.")
|
||||
|
||||
# --data-parallel is not supported currently.
|
||||
# https://github.com/vllm-project/vllm/issues/16222
|
||||
if args.data_parallel_size > 1:
|
||||
|
||||
if args.data_parallel_size > 1 and (
|
||||
args.distributed_executor_backend != "external_launcher"
|
||||
or args.async_engine):
|
||||
# --data-parallel is not supported fully.
|
||||
# Old issue: https://github.com/vllm-project/vllm/issues/16222
|
||||
# Currently we only support data parallel with external launcher
|
||||
# mode (i.e., launch with toruchrun).
|
||||
raise ValueError(
|
||||
"Data parallel is not supported in offline benchmark, "
|
||||
"Data parallel is only supported with external launcher mode "
|
||||
"with synchronous engine in offline benchmark, "
|
||||
"please use benchmark serving instead"
|
||||
)
|
||||
|
||||
|
||||
@ -340,15 +340,15 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter):
|
||||
num_graphs=len(self.compile_submod_names),
|
||||
runtime_shape=None)
|
||||
# Lazy import here to avoid circular import
|
||||
from .cuda_piecewise_backend import PiecewiseBackend
|
||||
from .piecewise_backend import PiecewiseBackend
|
||||
|
||||
piecewise_backend = PiecewiseBackend(
|
||||
submod, self.vllm_config, index,
|
||||
len(self.compile_submod_names), sym_shape_indices,
|
||||
compiled_graph_for_dynamic_shape, self.vllm_backend)
|
||||
|
||||
if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||
and
|
||||
if (self.compilation_config.cudagraph_mode.\
|
||||
has_piecewise_cudagraphs() and
|
||||
not self.compilation_config.use_inductor_graph_partition):
|
||||
# We're using Dynamo-based piecewise splitting, so we wrap
|
||||
# the whole subgraph with a static graph wrapper.
|
||||
|
||||
@ -336,7 +336,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
|
||||
from vllm.config import CUDAGraphMode
|
||||
|
||||
compilation_config = vllm_config.compilation_config
|
||||
if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||
if (compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
|
||||
and compilation_config.use_inductor_graph_partition):
|
||||
from torch._inductor.utils import CUDAGraphWrapperMetadata
|
||||
|
||||
@ -365,7 +365,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
|
||||
|
||||
yield
|
||||
|
||||
if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||
if (compilation_config.cudagraph_mode.has_piecewise_cudagraphs()
|
||||
and compilation_config.use_inductor_graph_partition):
|
||||
torch._inductor.utils.set_customized_partition_wrappers(None)
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
|
||||
@ -18,13 +17,12 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
|
||||
torch_compile_start_time = time.time()
|
||||
|
||||
compilation_config: CompilationConfig = vllm_config.compilation_config
|
||||
if compilation_config.level == CompilationLevel.PIECEWISE and \
|
||||
compilation_config.debug_dump_path:
|
||||
path = vllm_config.compile_debug_dump_path()
|
||||
if compilation_config.level == CompilationLevel.PIECEWISE and path:
|
||||
import depyf
|
||||
path = os.path.join(compilation_config.debug_dump_path,
|
||||
f"rank_{vllm_config.parallel_config.rank}")
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
global context_manager
|
||||
context_manager = depyf.prepare_debug(path)
|
||||
context_manager = depyf.prepare_debug(path.as_posix())
|
||||
context_manager.__enter__()
|
||||
|
||||
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
import functools
|
||||
import operator
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import ClassVar, Optional
|
||||
|
||||
import regex as re
|
||||
@ -96,12 +95,10 @@ class VllmPatternMatcherPass(VllmInductorPass):
|
||||
|
||||
TODO(luka): use pattern object to manually produce pattern graph
|
||||
"""
|
||||
debug_dump_path = config.compilation_config.debug_dump_path
|
||||
debug_dump_path = config.compile_debug_dump_path()
|
||||
if not debug_dump_path:
|
||||
return
|
||||
|
||||
rank = config.parallel_config.rank
|
||||
debug_dump_path = Path(debug_dump_path) / f"rank_{rank}"
|
||||
debug_dump_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
from vllm.utils import unique_filepath
|
||||
|
||||
@ -92,12 +92,11 @@ class TorchCompileWrapperWithCustomDispatcher:
|
||||
return
|
||||
|
||||
self.compiled_codes.append(new_code)
|
||||
debug_dump_dir = self.vllm_config.compilation_config.debug_dump_path
|
||||
if isinstance(debug_dump_dir, str) and debug_dump_dir != "":
|
||||
rank = self.vllm_config.parallel_config.rank
|
||||
decompiled_file = os.path.join(debug_dump_dir, f"rank_{rank}",
|
||||
"transformed_code.py")
|
||||
if not os.path.exists(decompiled_file):
|
||||
|
||||
path = self.vllm_config.compile_debug_dump_path()
|
||||
if path:
|
||||
decompiled_file = path / "transformed_code.py"
|
||||
if not decompiled_file.exists():
|
||||
try:
|
||||
# usually the decompilation will succeed for most models,
|
||||
# as we guarantee a full-graph compilation in Dynamo.
|
||||
|
||||
@ -1,28 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# ruff: noqa: F401
|
||||
import ast
|
||||
import copy
|
||||
import hashlib
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import textwrap
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import field, fields, is_dataclass, replace
|
||||
from functools import cached_property, lru_cache
|
||||
from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar,
|
||||
Union, cast)
|
||||
|
||||
import regex as re
|
||||
import torch
|
||||
from pydantic import ConfigDict, SkipValidation
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import runtime_checkable
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import version
|
||||
from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
|
||||
PrefixCachingHashAlgo)
|
||||
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
|
||||
@ -47,768 +25,82 @@ from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy
|
||||
from vllm.config.speculative import SpeculativeConfig
|
||||
from vllm.config.speech_to_text import SpeechToTextConfig
|
||||
from vllm.config.structured_outputs import StructuredOutputsConfig
|
||||
from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import DataclassInstance
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
else:
|
||||
DataclassInstance = Any
|
||||
PretrainedConfig = Any
|
||||
QuantizationConfig = Any
|
||||
QuantizationMethods = Any
|
||||
BaseModelLoader = Any
|
||||
LogitsProcessor = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance)
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class SupportsHash(Protocol):
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
...
|
||||
|
||||
|
||||
class SupportsMetricsInfo(Protocol):
|
||||
|
||||
def metrics_info(self) -> dict[str, str]:
|
||||
...
|
||||
|
||||
|
||||
@config
|
||||
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
||||
class VllmConfig:
|
||||
"""Dataclass which contains all vllm-related configuration. This
|
||||
simplifies passing around the distinct configurations in the codebase.
|
||||
"""
|
||||
|
||||
# TODO: use default_factory once default constructing ModelConfig doesn't
|
||||
# try to download a model
|
||||
model_config: ModelConfig = None # type: ignore
|
||||
"""Model configuration."""
|
||||
cache_config: CacheConfig = field(default_factory=CacheConfig)
|
||||
"""Cache configuration."""
|
||||
parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
|
||||
"""Parallel configuration."""
|
||||
scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
|
||||
"""Scheduler configuration."""
|
||||
device_config: DeviceConfig = field(default_factory=DeviceConfig)
|
||||
"""Device configuration."""
|
||||
load_config: LoadConfig = field(default_factory=LoadConfig)
|
||||
"""Load configuration."""
|
||||
lora_config: Optional[LoRAConfig] = None
|
||||
"""LoRA configuration."""
|
||||
speculative_config: Optional[SpeculativeConfig] = None
|
||||
"""Speculative decoding configuration."""
|
||||
structured_outputs_config: StructuredOutputsConfig = field(
|
||||
default_factory=StructuredOutputsConfig)
|
||||
"""Structured outputs configuration."""
|
||||
observability_config: Optional[ObservabilityConfig] = None
|
||||
"""Observability configuration."""
|
||||
quant_config: Optional[QuantizationConfig] = None
|
||||
"""Quantization configuration."""
|
||||
compilation_config: CompilationConfig = field(
|
||||
default_factory=CompilationConfig)
|
||||
"""`torch.compile` and cudagraph capture configuration for the model.
|
||||
|
||||
As a shorthand, `-O<n>` can be used to directly specify the compilation
|
||||
level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
|
||||
Currently, -O <n> and -O=<n> are supported as well but this will likely be
|
||||
removed in favor of clearer -O<n> syntax in the future.
|
||||
|
||||
NOTE: level 0 is the default level without any optimization. level 1 and 2
|
||||
are for internal testing only. level 3 is the recommended level for
|
||||
production, also default in V1.
|
||||
|
||||
You can specify the full compilation config like so:
|
||||
`{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
||||
"""
|
||||
kv_transfer_config: Optional[KVTransferConfig] = None
|
||||
"""The configurations for distributed KV cache transfer."""
|
||||
kv_events_config: Optional[KVEventsConfig] = None
|
||||
"""The configurations for event publishing."""
|
||||
# some opaque config, only used to provide additional information
|
||||
# for the hash computation, mainly used for testing, debugging or out of
|
||||
# tree config registration.
|
||||
additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
|
||||
"""Additional config for specified platform. Different platforms may
|
||||
support different configs. Make sure the configs are valid for the platform
|
||||
you are using. Contents must be hashable."""
|
||||
instance_id: str = ""
|
||||
"""The ID of the vLLM instance."""
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
WARNING: Whenever a new field is added to this config,
|
||||
ensure that it is included in the factors list if
|
||||
it affects the computation graph.
|
||||
|
||||
Provide a hash that uniquely identifies all the configs
|
||||
that affect the structure of the computation
|
||||
graph from input ids/embeddings to the final hidden states,
|
||||
excluding anything before input ids/embeddings and after
|
||||
the final hidden states.
|
||||
"""
|
||||
factors: list[Any] = []
|
||||
|
||||
# summarize vllm config
|
||||
vllm_factors: list[Any] = []
|
||||
from vllm import __version__
|
||||
vllm_factors.append(__version__)
|
||||
vllm_factors.append(envs.VLLM_USE_V1)
|
||||
if self.model_config:
|
||||
vllm_factors.append(self.model_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.cache_config:
|
||||
vllm_factors.append(self.cache_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.parallel_config:
|
||||
vllm_factors.append(self.parallel_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.scheduler_config:
|
||||
vllm_factors.append(self.scheduler_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.device_config:
|
||||
vllm_factors.append(self.device_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.load_config:
|
||||
vllm_factors.append(self.load_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.lora_config:
|
||||
vllm_factors.append(self.lora_config.compute_hash())
|
||||
# LoRA creates static buffers based on max_num_batched_tokens.
|
||||
# The tensor sizes and strides get captured in the torch.compile
|
||||
# graph explicitly.
|
||||
vllm_factors.append(
|
||||
str(self.scheduler_config.max_num_batched_tokens))
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.speculative_config:
|
||||
vllm_factors.append(self.speculative_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.structured_outputs_config:
|
||||
vllm_factors.append(self.structured_outputs_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.observability_config:
|
||||
vllm_factors.append(self.observability_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.quant_config:
|
||||
pass # should be captured by model_config.quantization
|
||||
if self.compilation_config:
|
||||
vllm_factors.append(self.compilation_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.kv_transfer_config:
|
||||
vllm_factors.append(self.kv_transfer_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.additional_config:
|
||||
if isinstance(additional_config := self.additional_config, dict):
|
||||
additional_config_hash = hashlib.md5(
|
||||
json.dumps(additional_config, sort_keys=True).encode(),
|
||||
usedforsecurity=False,
|
||||
).hexdigest()
|
||||
else:
|
||||
additional_config_hash = additional_config.compute_hash()
|
||||
vllm_factors.append(additional_config_hash)
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
factors.append(vllm_factors)
|
||||
|
||||
hash_str = hashlib.md5(str(factors).encode(),
|
||||
usedforsecurity=False).hexdigest()[:10]
|
||||
return hash_str
|
||||
|
||||
def pad_for_cudagraph(self, batch_size: int) -> int:
|
||||
# if batch_size > self.compilation_config.max_capture_size,
|
||||
# it should raise an IndexError.
|
||||
# the caller should make sure the batch_size is within the range,
|
||||
# i.e., batch_size <= self.compilation_config.max_capture_size
|
||||
return self.compilation_config.bs_to_padded_graph_size[batch_size]
|
||||
|
||||
@staticmethod
|
||||
def _get_quantization_config(
|
||||
model_config: ModelConfig,
|
||||
load_config: LoadConfig) -> Optional[QuantizationConfig]:
|
||||
"""Get the quantization config."""
|
||||
from vllm.platforms import current_platform
|
||||
if model_config.quantization is not None:
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
get_quant_config)
|
||||
quant_config = get_quant_config(model_config, load_config)
|
||||
capability_tuple = current_platform.get_device_capability()
|
||||
|
||||
if capability_tuple is not None:
|
||||
capability = capability_tuple.to_int()
|
||||
if capability < quant_config.get_min_capability():
|
||||
raise ValueError(
|
||||
f"The quantization method {model_config.quantization} "
|
||||
"is not supported for the current GPU. Minimum "
|
||||
f"capability: {quant_config.get_min_capability()}. "
|
||||
f"Current capability: {capability}.")
|
||||
supported_dtypes = quant_config.get_supported_act_dtypes()
|
||||
if model_config.dtype not in supported_dtypes:
|
||||
raise ValueError(
|
||||
f"{model_config.dtype} is not supported for quantization "
|
||||
f"method {model_config.quantization}. Supported dtypes: "
|
||||
f"{supported_dtypes}")
|
||||
quant_config.maybe_update_config(model_config.model)
|
||||
return quant_config
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_quantization_config(
|
||||
model_config: ModelConfig,
|
||||
load_config: LoadConfig) -> Optional[QuantizationConfig]:
|
||||
import copy
|
||||
|
||||
# For some reason, the _ version of this modifies the model_config
|
||||
# object, so using deepcopy to avoid this problem.
|
||||
return VllmConfig._get_quantization_config(copy.deepcopy(model_config),
|
||||
load_config)
|
||||
|
||||
def with_hf_config(
|
||||
self,
|
||||
hf_config: PretrainedConfig,
|
||||
architectures: Optional[list[str]] = None,
|
||||
) -> "VllmConfig":
|
||||
if architectures is not None:
|
||||
hf_config = copy.deepcopy(hf_config)
|
||||
hf_config.architectures = architectures
|
||||
|
||||
model_config = copy.deepcopy(self.model_config)
|
||||
model_config.hf_config = hf_config
|
||||
|
||||
return replace(self, model_config=model_config)
|
||||
|
||||
def __post_init__(self):
|
||||
"""Verify configs are valid & consistent with each other.
|
||||
"""
|
||||
|
||||
self.try_verify_and_update_config()
|
||||
|
||||
if self.model_config is not None:
|
||||
self.model_config.verify_with_parallel_config(self.parallel_config)
|
||||
self.model_config.verify_dual_chunk_attention_config(
|
||||
self.load_config)
|
||||
|
||||
self.cache_config.verify_with_parallel_config(self.parallel_config)
|
||||
|
||||
if self.lora_config is not None:
|
||||
self.lora_config.verify_with_cache_config(self.cache_config)
|
||||
self.lora_config.verify_with_model_config(self.model_config)
|
||||
|
||||
if self.quant_config is None and self.model_config is not None:
|
||||
self.quant_config = VllmConfig._get_quantization_config(
|
||||
self.model_config, self.load_config)
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
if self.model_config is not None and \
|
||||
self.scheduler_config.chunked_prefill_enabled and \
|
||||
self.model_config.dtype == torch.float32 and \
|
||||
current_platform.get_device_capability() == (7, 5):
|
||||
logger.warning_once(
|
||||
"Turing devices tensor cores do not support float32 matmul. "
|
||||
"To workaround this limitation, vLLM will set 'ieee' input "
|
||||
"precision for chunked prefill triton kernels.")
|
||||
|
||||
# If the user does not explicitly set a compilation level, then
|
||||
# we use the default level. The default level depends on other
|
||||
# settings (see the below code).
|
||||
if self.compilation_config.level is None:
|
||||
if envs.VLLM_USE_V1:
|
||||
if (self.model_config is not None
|
||||
and not self.model_config.enforce_eager):
|
||||
self.compilation_config.level = CompilationLevel.PIECEWISE
|
||||
else:
|
||||
self.compilation_config.level = \
|
||||
CompilationLevel.NO_COMPILATION
|
||||
|
||||
else:
|
||||
# NB: Passing both --enforce-eager and a compilation level
|
||||
# in V0 means the compilation level wins out.
|
||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
|
||||
# async tp is built on top of sequence parallelism
|
||||
# and requires it to be enabled.
|
||||
if self.compilation_config.pass_config.enable_async_tp:
|
||||
self.compilation_config.pass_config.enable_sequence_parallelism = \
|
||||
True
|
||||
if self.compilation_config.pass_config.enable_sequence_parallelism:
|
||||
self.compilation_config.custom_ops.append("+rms_norm")
|
||||
|
||||
if current_platform.support_static_graph_mode():
|
||||
# if cudagraph_mode is not explicitly set by users, set default
|
||||
# value
|
||||
if self.compilation_config.cudagraph_mode is None:
|
||||
if envs.VLLM_USE_V1 and self.compilation_config.level \
|
||||
== CompilationLevel.PIECEWISE:
|
||||
# default to full and piecewise for most models
|
||||
self.compilation_config.cudagraph_mode = \
|
||||
CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
|
||||
# pooling models and encoder-decoder models
|
||||
# do not support full cudagraphs
|
||||
if self.model_config is not None and \
|
||||
(self.model_config.pooler_config is not None
|
||||
or self.model_config.is_encoder_decoder):
|
||||
self.compilation_config.cudagraph_mode = \
|
||||
CUDAGraphMode.PIECEWISE
|
||||
else:
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
|
||||
# disable cudagraph when enforce eager execution
|
||||
if self.model_config is not None and \
|
||||
self.model_config.enforce_eager:
|
||||
logger.info("Cudagraph is disabled under eager mode")
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
elif envs.VLLM_USE_V1:
|
||||
self.compilation_config.cudagraph_num_of_warmups = 1
|
||||
|
||||
self._set_cudagraph_sizes()
|
||||
else:
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
|
||||
|
||||
if self.cache_config.kv_sharing_fast_prefill:
|
||||
|
||||
if self.speculative_config is not None and \
|
||||
self.speculative_config.use_eagle():
|
||||
raise NotImplementedError(
|
||||
"Fast prefill optimization for KV sharing is not "
|
||||
"compatible with EAGLE as EAGLE requires correct logits "
|
||||
"for all tokens while fast prefill gives incorrect logits "
|
||||
"for prompt tokens.")
|
||||
|
||||
logger.warning_once(
|
||||
"--kv-sharing-fast-prefill requires changes on model side for "
|
||||
"correctness and to realize prefill savings. ")
|
||||
|
||||
disable_chunked_prefill_reasons: list[str] = []
|
||||
|
||||
if self.model_config:
|
||||
if self.model_config.pooler_config:
|
||||
pooling_type = self.model_config.pooler_config.pooling_type
|
||||
if pooling_type is None or pooling_type.lower() != "last":
|
||||
disable_chunked_prefill_reasons.append(
|
||||
"Only \"last\" pooling supports chunked "
|
||||
"prefill and prefix caching; disabling both.")
|
||||
if not getattr(self.model_config.hf_config, "is_causal", True):
|
||||
disable_chunked_prefill_reasons.append(
|
||||
"Only models using causal attention supports chunked "
|
||||
"prefill and prefix caching; disabling both.")
|
||||
elif self.model_config.is_encoder_decoder:
|
||||
self.scheduler_config.max_num_encoder_input_tokens = \
|
||||
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
|
||||
logger.debug(
|
||||
"Encoder-decoder model detected: setting "
|
||||
"`max_num_encoder_input_tokens` to encoder length (%s)",
|
||||
self.scheduler_config.max_num_encoder_input_tokens)
|
||||
self.scheduler_config.disable_chunked_mm_input = True
|
||||
disable_chunked_prefill_reasons.append(
|
||||
"Encoder-decoder models do not support chunked prefill nor"
|
||||
" prefix caching; disabling both.")
|
||||
if (self.model_config.architecture
|
||||
== "WhisperForConditionalGeneration"
|
||||
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
|
||||
!= "spawn"):
|
||||
logger.warning(
|
||||
"Whisper is known to have issues with "
|
||||
"forked workers. If startup is hanging, "
|
||||
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
|
||||
"to 'spawn'.")
|
||||
|
||||
if disable_chunked_prefill_reasons:
|
||||
for reason in disable_chunked_prefill_reasons:
|
||||
logger.info(reason)
|
||||
self.scheduler_config.chunked_prefill_enabled = False
|
||||
self.scheduler_config.long_prefill_token_threshold = 0
|
||||
|
||||
if self.cache_config is not None:
|
||||
self.cache_config.enable_prefix_caching = False
|
||||
|
||||
if (self.kv_events_config is not None
|
||||
and self.kv_events_config.enable_kv_cache_events
|
||||
and not self.cache_config.enable_prefix_caching):
|
||||
logger.warning(
|
||||
"KV cache events are on, but prefix caching is not enabled."
|
||||
"Use --enable-prefix-caching to enable.")
|
||||
if (self.kv_events_config is not None
|
||||
and self.kv_events_config.publisher != "null"
|
||||
and not self.kv_events_config.enable_kv_cache_events):
|
||||
logger.warning("KV cache events are disabled,"
|
||||
"but the scheduler is configured to publish them."
|
||||
"Modify KVEventsConfig.enable_kv_cache_events"
|
||||
"to True to enable.")
|
||||
current_platform.check_and_update_config(self)
|
||||
|
||||
# final check of cudagraph mode after platform-specific update
|
||||
if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
|
||||
if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \
|
||||
and self.model_config is not None and \
|
||||
not self.model_config.disable_cascade_attn:
|
||||
logger.info("CUDAGraphMode.FULL is not supported with "
|
||||
"cascade attention currently. Disabling cascade"
|
||||
"attention.")
|
||||
self.model_config.disable_cascade_attn = True
|
||||
|
||||
if self.compilation_config.cudagraph_mode\
|
||||
.requires_piecewise_compilation():
|
||||
assert self.compilation_config.level == \
|
||||
CompilationLevel.PIECEWISE, \
|
||||
"Compilation level should be CompilationLevel.PIECEWISE "\
|
||||
"when cudagraph_mode piecewise cudagraphs is used, "\
|
||||
f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
|
||||
|
||||
if self.parallel_config.enable_dbo:
|
||||
a2a_backend = envs.VLLM_ALL2ALL_BACKEND
|
||||
assert a2a_backend in \
|
||||
["deepep_low_latency", "deepep_high_throughput"], \
|
||||
"Microbatching currently only supports the deepep_low_latency and "\
|
||||
f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
|
||||
"supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
|
||||
"variable to deepep_low_latency or deepep_high_throughput and "\
|
||||
"install the DeepEP kernels."
|
||||
|
||||
if not self.instance_id:
|
||||
self.instance_id = random_uuid()[:5]
|
||||
|
||||
# Do this after all the updates to compilation_config.level
|
||||
if envs.VLLM_USE_V1 and \
|
||||
self.compilation_config.level == CompilationLevel.PIECEWISE:
|
||||
self.compilation_config.set_splitting_ops_for_v1()
|
||||
|
||||
if (envs.VLLM_USE_V1
|
||||
and not self.scheduler_config.disable_hybrid_kv_cache_manager):
|
||||
# logger should only print warning message for hybrid models. As we
|
||||
# can't know whether the model is hybrid or not now, so we don't log
|
||||
# warning message here and will log it later.
|
||||
if not current_platform.support_hybrid_kv_cache():
|
||||
# Hybrid KV cache manager is not supported on non-GPU platforms.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||
if self.kv_transfer_config is not None:
|
||||
# Hybrid KV cache manager is not compatible with KV transfer.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||
if self.kv_events_config is not None:
|
||||
# Hybrid KV cache manager is not compatible with KV events.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||
if self.model_config is not None and \
|
||||
self.model_config.attention_chunk_size is not None:
|
||||
if self.speculative_config is not None and \
|
||||
self.speculative_config.use_eagle():
|
||||
# Hybrid KV cache manager is not yet supported with chunked
|
||||
# local attention + eagle.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||
elif \
|
||||
not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
|
||||
logger.warning(
|
||||
"There is a latency regression when using chunked local"
|
||||
" attention with the hybrid KV cache manager. Disabling"
|
||||
" it, by default. To enable it, set the environment "
|
||||
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
|
||||
)
|
||||
# Hybrid KV cache manager is not yet supported with chunked
|
||||
# local attention.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = True
|
||||
|
||||
def update_sizes_for_sequence_parallelism(self,
|
||||
possible_sizes: list) -> list:
|
||||
# remove the sizes that not multiple of tp_size when
|
||||
# enable sequence parallelism
|
||||
removed_sizes = [
|
||||
size for size in possible_sizes
|
||||
if size % self.parallel_config.tensor_parallel_size != 0
|
||||
]
|
||||
if removed_sizes:
|
||||
logger.warning(
|
||||
"Batch sizes %s are removed because they are not "
|
||||
"multiple of tp_size %d when "
|
||||
"sequence parallelism is enabled", removed_sizes,
|
||||
self.parallel_config.tensor_parallel_size)
|
||||
|
||||
return [
|
||||
size for size in possible_sizes
|
||||
if size % self.parallel_config.tensor_parallel_size == 0
|
||||
]
|
||||
|
||||
def _set_cudagraph_sizes(self):
|
||||
"""
|
||||
vLLM defines the default candidate list of batch sizes for CUDA graph
|
||||
capture as:
|
||||
|
||||
```python
|
||||
max_graph_size = min(max_num_seqs * 2, 512)
|
||||
# 1, 2, 4, then multiples of 8 up to max_graph_size
|
||||
cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
|
||||
|
||||
In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
|
||||
will be the final sizes to capture cudagraph (in descending order).
|
||||
|
||||
These sizes are used to capture and reuse CUDA graphs for
|
||||
performance-critical paths (e.g., decoding). Capturing enables
|
||||
significantly faster kernel dispatch by avoiding Python overhead. The
|
||||
list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
|
||||
most GPUs), which controls the total allowed number of tokens in a
|
||||
batch. Since each sequence may have a variable number of tokens, the
|
||||
maximum usable batch size will depend on actual sequence lengths.
|
||||
|
||||
Example:
|
||||
With `max_num_batched_tokens = 8192`, and typical sequences
|
||||
averaging ~32 tokens, most practical batch sizes fall below 256.
|
||||
However, the system will still allow capture sizes up to 512 if
|
||||
shape and memory permit.
|
||||
|
||||
Note:
|
||||
If users explicitly specify cudagraph capture sizes in the
|
||||
compilation config, those will override this default logic.
|
||||
At runtime:
|
||||
|
||||
- If batch size <= one of the `cudagraph_capture_sizes`, the closest
|
||||
padded CUDA graph will be used.
|
||||
- If batch size > largest `cudagraph_capture_sizes`, cudagraph will
|
||||
not be used.
|
||||
"""
|
||||
|
||||
# calculate the default `batch_size_capture_list`
|
||||
batch_size_capture_list = []
|
||||
if self.model_config is not None and \
|
||||
not self.model_config.enforce_eager:
|
||||
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
|
||||
if len(cuda_graph_sizes) == 1:
|
||||
batch_size_capture_list = [1, 2, 4] + [
|
||||
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
|
||||
]
|
||||
elif len(cuda_graph_sizes) > 1:
|
||||
batch_size_capture_list = sorted(cuda_graph_sizes)
|
||||
else:
|
||||
raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
|
||||
if self.parallel_config.tensor_parallel_size > 1 and \
|
||||
self.compilation_config.pass_config.enable_sequence_parallelism:
|
||||
batch_size_capture_list = \
|
||||
self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
|
||||
max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
||||
batch_size_capture_list = [
|
||||
size for size in batch_size_capture_list
|
||||
if size <= max_num_tokens
|
||||
]
|
||||
|
||||
self.compilation_config.init_with_cudagraph_sizes(
|
||||
batch_size_capture_list)
|
||||
|
||||
def recalculate_max_model_len(self, max_model_len: int):
|
||||
# Can only be called in try_verify_and_update_config
|
||||
model_config = self.model_config
|
||||
max_model_len = model_config.get_and_verify_max_len(max_model_len)
|
||||
self.model_config.max_model_len = max_model_len
|
||||
self.scheduler_config.max_model_len = max_model_len
|
||||
|
||||
def try_verify_and_update_config(self):
|
||||
if self.model_config is None:
|
||||
return
|
||||
|
||||
# Avoid running try_verify_and_update_config multiple times
|
||||
if getattr(self.model_config, "config_updated", False):
|
||||
return
|
||||
self.model_config.config_updated = True
|
||||
|
||||
architecture = self.model_config.architecture
|
||||
if architecture is None:
|
||||
return
|
||||
|
||||
from vllm.model_executor.models.config import (
|
||||
MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
|
||||
cls = MODELS_CONFIG_MAP.get(architecture, None)
|
||||
if cls is not None:
|
||||
cls.verify_and_update_config(self)
|
||||
|
||||
if self.model_config.is_hybrid:
|
||||
HybridAttentionMambaModelConfig.verify_and_update_config(self)
|
||||
|
||||
if self.model_config.convert_type == "classify":
|
||||
# Maybe convert ForCausalLM into ForSequenceClassification model.
|
||||
from vllm.model_executor.models.adapters import (
|
||||
SequenceClassificationConfig)
|
||||
SequenceClassificationConfig.verify_and_update_config(self)
|
||||
|
||||
if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
|
||||
self.model_config.model_weights):
|
||||
if self.load_config.load_format == "auto":
|
||||
logger.info("Detected Run:ai model config. "
|
||||
"Overriding `load_format` to 'runai_streamer'")
|
||||
self.load_config.load_format = "runai_streamer"
|
||||
elif self.load_config.load_format != "runai_streamer":
|
||||
raise ValueError(f"To load a model from S3, 'load_format' "
|
||||
f"must be 'runai_streamer', "
|
||||
f"but got '{self.load_config.load_format}'. "
|
||||
f"Model: {self.model_config.model}")
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
f"model={self.model_config.model!r}, "
|
||||
f"speculative_config={self.speculative_config!r}, "
|
||||
f"tokenizer={self.model_config.tokenizer!r}, "
|
||||
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
|
||||
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
|
||||
f"revision={self.model_config.revision}, "
|
||||
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
|
||||
f"trust_remote_code={self.model_config.trust_remote_code}, "
|
||||
f"dtype={self.model_config.dtype}, "
|
||||
f"max_seq_len={self.model_config.max_model_len}, "
|
||||
f"download_dir={self.load_config.download_dir!r}, "
|
||||
f"load_format={self.load_config.load_format}, "
|
||||
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa
|
||||
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
|
||||
f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa
|
||||
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
|
||||
f"quantization={self.model_config.quantization}, "
|
||||
f"enforce_eager={self.model_config.enforce_eager}, "
|
||||
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
|
||||
f"device_config={self.device_config.device}, "
|
||||
f"structured_outputs_config={self.structured_outputs_config!r}, "
|
||||
f"observability_config={self.observability_config!r}, "
|
||||
f"seed={self.model_config.seed}, "
|
||||
f"served_model_name={self.model_config.served_model_name}, "
|
||||
f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
|
||||
f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa
|
||||
f"pooler_config={self.model_config.pooler_config!r}, "
|
||||
f"compilation_config={self.compilation_config!r}")
|
||||
|
||||
|
||||
_current_vllm_config: Optional[VllmConfig] = None
|
||||
_current_prefix: Optional[str] = None
|
||||
|
||||
|
||||
@contextmanager
|
||||
def set_current_vllm_config(vllm_config: VllmConfig,
|
||||
check_compile=False,
|
||||
prefix: Optional[str] = None):
|
||||
"""
|
||||
Temporarily set the current vLLM config.
|
||||
Used during model initialization.
|
||||
We save the current vLLM config in a global variable,
|
||||
so that all modules can access it, e.g. custom ops
|
||||
can access the vLLM config to determine how to dispatch.
|
||||
"""
|
||||
global _current_vllm_config, _current_prefix
|
||||
old_vllm_config = _current_vllm_config
|
||||
old_prefix = _current_prefix
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
num_models_seen = compilation_counter.num_models_seen
|
||||
try:
|
||||
_current_vllm_config = vllm_config
|
||||
_current_prefix = prefix
|
||||
yield
|
||||
except Exception:
|
||||
raise
|
||||
else:
|
||||
if check_compile:
|
||||
vllm_config.compilation_config.custom_op_log_check()
|
||||
|
||||
if check_compile and \
|
||||
vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
|
||||
and compilation_counter.num_models_seen == num_models_seen:
|
||||
# If the model supports compilation,
|
||||
# compilation_counter.num_models_seen should be increased
|
||||
# by at least 1.
|
||||
# If it is not increased, it means the model does not support
|
||||
# compilation (does not have @support_torch_compile decorator).
|
||||
logger.warning(
|
||||
"`torch.compile` is turned on, but the model %s"
|
||||
" does not support it. Please open an issue on GitHub"
|
||||
" if you want it to be supported.",
|
||||
vllm_config.model_config.model)
|
||||
finally:
|
||||
_current_vllm_config = old_vllm_config
|
||||
_current_prefix = old_prefix
|
||||
# Clear the compilation config cache when context changes
|
||||
get_cached_compilation_config.cache_clear()
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_cached_compilation_config():
|
||||
"""Cache config to avoid repeated calls to get_current_vllm_config()"""
|
||||
return get_current_vllm_config().compilation_config
|
||||
|
||||
|
||||
def get_current_vllm_config() -> VllmConfig:
|
||||
if _current_vllm_config is None:
|
||||
# in ci, usually when we test custom ops/modules directly,
|
||||
# we don't set the vllm config. In that case, we set a default
|
||||
# config.
|
||||
logger.warning("Current vLLM config is not set.")
|
||||
from vllm.config import VllmConfig
|
||||
return VllmConfig()
|
||||
return _current_vllm_config
|
||||
|
||||
|
||||
def get_current_model_prefix() -> str:
|
||||
"""
|
||||
Get the prefix of the model that's currently being initialized.
|
||||
"""
|
||||
assert _current_prefix is not None, \
|
||||
"Current model prefix is not set. "
|
||||
return _current_prefix
|
||||
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def get_layers_from_vllm_config(
|
||||
vllm_config: VllmConfig,
|
||||
layer_type: type[T],
|
||||
layer_names: Optional[list[str]] = None) -> dict[str, T]:
|
||||
"""
|
||||
Get layers from the vLLM config.
|
||||
|
||||
Args:
|
||||
vllm_config: The vLLM config.
|
||||
layer_type: The type of the layer to get.
|
||||
layer_names: The names of the layers to get. If None, return all layers.
|
||||
"""
|
||||
|
||||
if layer_names is None:
|
||||
layer_names = list(
|
||||
vllm_config.compilation_config.static_forward_context.keys())
|
||||
|
||||
forward_context = vllm_config.compilation_config.static_forward_context
|
||||
|
||||
return {
|
||||
layer_name: forward_context[layer_name]
|
||||
for layer_name in layer_names
|
||||
if isinstance(forward_context[layer_name], layer_type)
|
||||
}
|
||||
|
||||
|
||||
def update_config(config: DataclassInstanceT,
|
||||
overrides: dict[str, Any]) -> DataclassInstanceT:
|
||||
processed_overrides = {}
|
||||
for field_name, value in overrides.items():
|
||||
assert hasattr(
|
||||
config, field_name), f"{type(config)} has no field `{field_name}`"
|
||||
current_value = getattr(config, field_name)
|
||||
if is_dataclass(current_value) and not is_dataclass(value):
|
||||
assert isinstance(value, dict), (
|
||||
f"Overrides to {type(config)}.{field_name} must be a dict"
|
||||
f" or {type(current_value)}, but got {type(value)}")
|
||||
value = update_config(
|
||||
current_value, # type: ignore[type-var]
|
||||
value)
|
||||
processed_overrides[field_name] = value
|
||||
return replace(config, **processed_overrides)
|
||||
from vllm.config.utils import (ConfigType, SupportsMetricsInfo, config,
|
||||
get_attr_docs, is_init_field, update_config)
|
||||
from vllm.config.vllm import (VllmConfig, get_cached_compilation_config,
|
||||
get_current_vllm_config,
|
||||
get_layers_from_vllm_config,
|
||||
set_current_vllm_config)
|
||||
|
||||
__all__ = [
|
||||
# From vllm.config.cache
|
||||
"BlockSize",
|
||||
"CacheConfig",
|
||||
"CacheDType",
|
||||
"MambaDType",
|
||||
"PrefixCachingHashAlgo",
|
||||
# From vllm.config.compilation
|
||||
"CompilationConfig",
|
||||
"CompilationLevel",
|
||||
"CUDAGraphMode",
|
||||
"PassConfig",
|
||||
# From vllm.config.device
|
||||
"Device",
|
||||
"DeviceConfig",
|
||||
# From vllm.config.kv_events
|
||||
"KVEventsConfig",
|
||||
# From vllm.config.kv_transfer
|
||||
"KVTransferConfig",
|
||||
# From vllm.config.load
|
||||
"LoadConfig",
|
||||
# From vllm.config.lora
|
||||
"LoRAConfig",
|
||||
# From vllm.config.model
|
||||
"ConvertOption",
|
||||
"HfOverrides",
|
||||
"LogprobsMode",
|
||||
"ModelConfig",
|
||||
"ModelDType",
|
||||
"ModelImpl",
|
||||
"RunnerOption",
|
||||
"TaskOption",
|
||||
"TokenizerMode",
|
||||
"iter_architecture_defaults",
|
||||
"try_match_architecture_defaults",
|
||||
# From vllm.config.multimodal
|
||||
"MMCacheType",
|
||||
"MMEncoderTPMode",
|
||||
"MultiModalConfig",
|
||||
# From vllm.config.observability
|
||||
"DetailedTraceModules",
|
||||
"ObservabilityConfig",
|
||||
# From vllm.config.parallel
|
||||
"DistributedExecutorBackend",
|
||||
"EPLBConfig",
|
||||
"ParallelConfig",
|
||||
# From vllm.config.pooler
|
||||
"PoolerConfig",
|
||||
# From vllm.config.scheduler
|
||||
"RunnerType",
|
||||
"SchedulerConfig",
|
||||
"SchedulerPolicy",
|
||||
# From vllm.config.speculative
|
||||
"SpeculativeConfig",
|
||||
# From vllm.config.speech_to_text
|
||||
"SpeechToTextConfig",
|
||||
# From vllm.config.structured_outputs
|
||||
"StructuredOutputsConfig",
|
||||
# From vllm.config.utils
|
||||
"ConfigType",
|
||||
"SupportsMetricsInfo",
|
||||
"config",
|
||||
"get_attr_docs",
|
||||
"is_init_field",
|
||||
"update_config",
|
||||
# From vllm.config.vllm
|
||||
"VllmConfig",
|
||||
"get_cached_compilation_config",
|
||||
"get_current_vllm_config",
|
||||
"set_current_vllm_config",
|
||||
"get_layers_from_vllm_config",
|
||||
]
|
||||
|
||||
@ -5,6 +5,7 @@ import enum
|
||||
import hashlib
|
||||
from collections import Counter
|
||||
from dataclasses import asdict, field
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
|
||||
|
||||
from pydantic import TypeAdapter, field_validator
|
||||
@ -61,9 +62,20 @@ class CUDAGraphMode(enum.Enum):
|
||||
def has_full_cudagraphs(self) -> bool:
|
||||
return self.max_cudagraph_mode() == CUDAGraphMode.FULL
|
||||
|
||||
def has_piecewise_cudagraphs(self) -> bool:
|
||||
return self.requires_piecewise_compilation()
|
||||
|
||||
def separate_routine(self) -> bool:
|
||||
return isinstance(self.value, tuple)
|
||||
|
||||
def valid_runtime_modes(self) -> bool:
|
||||
return self in [
|
||||
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
|
||||
]
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.name
|
||||
|
||||
|
||||
@config
|
||||
@dataclass
|
||||
@ -161,7 +173,7 @@ class CompilationConfig:
|
||||
- 1: dynamo as is.
|
||||
- 2: dynamo once.
|
||||
- 3: piecewise compilation."""
|
||||
debug_dump_path: str = ""
|
||||
debug_dump_path: Optional[Path] = None
|
||||
"""The path to dump the debug information."""
|
||||
cache_dir: str = ""
|
||||
"""The directory to store the compiled graph, to accelerate Inductor
|
||||
@ -269,7 +281,8 @@ class CompilationConfig:
|
||||
Note that this is orthogonal to the cudagraph capture logic
|
||||
outside of compilation.
|
||||
Warning: This flag is deprecated and will be removed in the next major or
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=PIECEWISE
|
||||
instead.
|
||||
"""
|
||||
cudagraph_num_of_warmups: int = 0
|
||||
"""Number of warmup runs for cudagraph.
|
||||
@ -294,7 +307,8 @@ class CompilationConfig:
|
||||
flag cannot be used together with splitting_ops. This may provide
|
||||
performance benefits for smaller models.
|
||||
Warning: This flag is deprecated and will be removed in the next major or
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead.
|
||||
minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode=
|
||||
FULL_AND_PIECEWISE instead.
|
||||
"""
|
||||
|
||||
use_inductor_graph_partition: bool = False
|
||||
@ -407,10 +421,11 @@ class CompilationConfig:
|
||||
if pass_config_exclude:
|
||||
exclude["pass_config"] = pass_config_exclude
|
||||
|
||||
return TypeAdapter(CompilationConfig).dump_json(
|
||||
self,
|
||||
exclude=exclude, # type: ignore[arg-type]
|
||||
exclude_unset=True).decode()
|
||||
config = TypeAdapter(CompilationConfig).dump_python(self,
|
||||
exclude=exclude,
|
||||
exclude_unset=True)
|
||||
|
||||
return str(config)
|
||||
|
||||
__str__ = __repr__
|
||||
|
||||
@ -465,7 +480,8 @@ class CompilationConfig:
|
||||
if not self.use_cudagraph:
|
||||
logger.warning("use_cudagraph is deprecated, use "
|
||||
"cudagraph_mode=NONE instead.")
|
||||
if self.cudagraph_mode is not None:
|
||||
if self.cudagraph_mode is not None and \
|
||||
self.cudagraph_mode != CUDAGraphMode.NONE:
|
||||
raise ValueError(
|
||||
"use_cudagraph and cudagraph_mode are mutually"
|
||||
" exclusive, prefer cudagraph_mode since "
|
||||
@ -474,7 +490,8 @@ class CompilationConfig:
|
||||
if self.full_cuda_graph:
|
||||
logger.warning("full_cuda_graph is deprecated, use "
|
||||
"cudagraph_mode=FULL instead.")
|
||||
if self.cudagraph_mode is not None:
|
||||
if self.cudagraph_mode is not None and \
|
||||
not self.cudagraph_mode.has_full_cudagraphs():
|
||||
raise ValueError("full_cuda_graph and cudagraph_mode are "
|
||||
"mutually exclusive, prefer cudagraph_mode "
|
||||
"since full_cuda_graph is deprecated.")
|
||||
@ -571,48 +588,75 @@ class CompilationConfig:
|
||||
"set_splitting_ops_for_v1 should only be called when "
|
||||
"level is CompilationLevel.PIECEWISE")
|
||||
|
||||
if self.use_inductor_graph_partition:
|
||||
self.set_splitting_ops_for_inductor_graph_partition()
|
||||
return
|
||||
|
||||
if self.pass_config.enable_attn_fusion:
|
||||
# here use_inductor_graph_partition is False
|
||||
self.set_splitting_ops_for_attn_fusion()
|
||||
return
|
||||
|
||||
if self.splitting_ops is None:
|
||||
# NOTE: When using full cudagraph, instead of setting an empty
|
||||
# list and capture the full cudagraph inside the flattened fx
|
||||
# graph, we keep the piecewise fx graph structure but capture
|
||||
# the full cudagraph outside the fx graph. This reduces some
|
||||
# cpu overhead when the runtime batch_size is not cudagraph
|
||||
# captured. see https://github.com/vllm-project/vllm/pull/20059
|
||||
# for details. Make a copy to avoid mutating the class-level
|
||||
# list via reference.
|
||||
self.splitting_ops = list(self._attention_ops)
|
||||
elif len(self.splitting_ops) == 0:
|
||||
logger.warning_once(
|
||||
"Using piecewise compilation with empty splitting_ops")
|
||||
if self.cudagraph_mode == CUDAGraphMode.PIECEWISE:
|
||||
logger.warning_once(
|
||||
"Piecewise compilation with empty splitting_ops do not" \
|
||||
"contains piecewise cudagraph. Setting cudagraph_"
|
||||
"mode to NONE. Hint: If you are using attention backends "
|
||||
"that support cudagraph, consider manually setting "
|
||||
"cudagraph_mode to FULL or FULL_DECODE_ONLY to enable "
|
||||
"full cudagraphs.")
|
||||
self.cudagraph_mode = CUDAGraphMode.NONE
|
||||
elif self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
|
||||
logger.warning_once(
|
||||
"Piecewise compilation with empty splitting_ops do not "
|
||||
"contains piecewise cudagraph. Setting cudagraph_mode "
|
||||
"to FULL.")
|
||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||
self.splitting_ops = []
|
||||
|
||||
def set_splitting_ops_for_inductor_graph_partition(self):
|
||||
assert self.use_inductor_graph_partition
|
||||
use_inductor_graph_partition_msg = (
|
||||
"When use_inductor_graph_partition=True, splitting_ops "
|
||||
"are ignored and set to an empty list. Instead, "
|
||||
"\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is "
|
||||
"used to annotate custom ops for graph partition.")
|
||||
|
||||
if self.splitting_ops is None:
|
||||
if self.use_inductor_graph_partition:
|
||||
# When using inductor graph partition, we set splitting_ops
|
||||
# to be empty and rely on torch._C.Tag.cudagraph_unsafe to
|
||||
# annotate custom ops as splitting ops.
|
||||
logger.warning_once(use_inductor_graph_partition_msg)
|
||||
self.splitting_ops = []
|
||||
else:
|
||||
# NOTE: When using full cudagraph, instead of setting an empty
|
||||
# list and capture the full cudagraph inside the flattened fx
|
||||
# graph, we keep the piecewise fx graph structure but capture
|
||||
# the full cudagraph outside the fx graph. This reduces some
|
||||
# cpu overhead when the runtime batch_size is not cudagraph
|
||||
# captured. see https://github.com/vllm-project/vllm/pull/20059
|
||||
# for details. make a copy to avoid mutating the class-level
|
||||
# list via reference.
|
||||
self.splitting_ops = list(self._attention_ops)
|
||||
elif len(self.splitting_ops) == 0:
|
||||
logger.warning_once(
|
||||
"Using piecewise compilation with empty "
|
||||
"splitting_ops and use_inductor_graph_partition"
|
||||
f"={self.use_inductor_graph_partition}.")
|
||||
if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
||||
and not self.use_inductor_graph_partition):
|
||||
logger.warning_once(
|
||||
"When compilation level is piecewise with empty "
|
||||
"splitting_ops, PIECEWISE cudagraph_mode will be "
|
||||
"treated as FULL cudagraph_mode. Please ensure you are "
|
||||
"using attention backends that support cudagraph or set "
|
||||
"cudagraph_mode to NONE explicitly if encountering "
|
||||
"any problems.")
|
||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||
self.splitting_ops = []
|
||||
elif self.use_inductor_graph_partition:
|
||||
if self.splitting_ops is not None and \
|
||||
len(self.splitting_ops) > 0:
|
||||
logger.warning_once(use_inductor_graph_partition_msg)
|
||||
self.splitting_ops = []
|
||||
|
||||
def set_splitting_ops_for_attn_fusion(self):
|
||||
assert self.pass_config.enable_attn_fusion
|
||||
if self.splitting_ops is None:
|
||||
self.splitting_ops = []
|
||||
if self.cudagraph_mode.has_piecewise_cudagraphs():
|
||||
logger.warning_once(
|
||||
"enable_attn_fusion is incompatible with piecewise "
|
||||
"cudagraph when use_inductor_graph_partition is off."
|
||||
"In this case, splitting_ops will be set to empty "
|
||||
"list, and cudagraph_mode will be set to FULL. "
|
||||
"Please ensure you are using attention backends that "
|
||||
"support cudagraph or set cudagraph_mode to NONE "
|
||||
"explicitly if encountering any problems.")
|
||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||
|
||||
assert not self.splitting_ops_contain_attention(), (
|
||||
"attention ops should not be in splitting_ops "
|
||||
"when enable_attn_fusion is True")
|
||||
|
||||
def splitting_ops_contain_attention(self) -> bool:
|
||||
return self.splitting_ops is not None and all(
|
||||
|
||||
@ -28,8 +28,8 @@ class KVTransferConfig:
|
||||
"""The engine id for KV transfers."""
|
||||
|
||||
kv_buffer_device: Optional[str] = "cuda"
|
||||
"""The device used by kv connector to buffer the KV cache.
|
||||
Currently only support 'cuda'."""
|
||||
"""The device used by kv connector to buffer the KV cache. Choices are
|
||||
'cuda' and 'cpu'."""
|
||||
|
||||
kv_buffer_size: float = 1e9
|
||||
"""The buffer size for TorchDistributedConnector. Measured in number of
|
||||
|
||||
@ -509,9 +509,14 @@ class ModelConfig:
|
||||
else: # task == "auto"
|
||||
pass
|
||||
else:
|
||||
debug_info = {
|
||||
"architectures": architectures,
|
||||
"is_generative_model": is_generative_model,
|
||||
"is_pooling_model": is_pooling_model,
|
||||
}
|
||||
raise AssertionError("The model should be a generative or "
|
||||
"pooling model when task is set to "
|
||||
f"{self.task!r}.")
|
||||
f"{self.task!r}. Found: {debug_info}")
|
||||
|
||||
self.runner = runner
|
||||
self.convert = convert
|
||||
@ -1329,11 +1334,13 @@ class ModelConfig:
|
||||
self.hf_config_path or self.model,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
revision=self.revision,
|
||||
config_format=self.config_format,
|
||||
)
|
||||
else:
|
||||
config = try_get_generation_config(
|
||||
self.generation_config,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
config_format=self.config_format,
|
||||
)
|
||||
|
||||
if config is None:
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
from dataclasses import field
|
||||
from dataclasses import InitVar, field
|
||||
from typing import Any, Literal, Union
|
||||
|
||||
from pydantic import SkipValidation, model_validator
|
||||
@ -84,6 +84,13 @@ class SchedulerConfig:
|
||||
is_multimodal_model: bool = False
|
||||
"""True if the model is multimodal."""
|
||||
|
||||
is_encoder_decoder: InitVar[bool] = False
|
||||
"""True if the model is an encoder-decoder model.
|
||||
|
||||
Note: This is stored in the ModelConfig, and is used only here to
|
||||
disable chunked prefill and prefix caching for encoder-decoder models.
|
||||
"""
|
||||
|
||||
# TODO (ywang96): Make this configurable.
|
||||
max_num_encoder_input_tokens: int = field(init=False)
|
||||
"""Multimodal encoder compute budget, only used in V1.
|
||||
@ -161,13 +168,23 @@ class SchedulerConfig:
|
||||
usedforsecurity=False).hexdigest()
|
||||
return hash_str
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
def __post_init__(self, is_encoder_decoder: bool) -> None:
|
||||
if self.max_model_len is None:
|
||||
self.max_model_len = 8192
|
||||
|
||||
if self.max_num_seqs is None:
|
||||
self.max_num_seqs = 128
|
||||
|
||||
if is_encoder_decoder:
|
||||
# Chunked prefill should be disabled for encoder-decoder models.
|
||||
self.disable_chunked_mm_input = True
|
||||
self.chunked_prefill_enabled = False
|
||||
self.enable_chunked_prefill = False
|
||||
self.long_prefill_token_threshold = 0
|
||||
logger.info(
|
||||
"Encoder-decoder models do not support chunked prefill nor"
|
||||
" prefix caching; disabling both.")
|
||||
|
||||
if self.max_num_batched_tokens is None:
|
||||
if self.enable_chunked_prefill:
|
||||
self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
|
||||
|
||||
@ -41,8 +41,7 @@ MTP_MODEL_TYPES = ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp",
|
||||
@dataclass
|
||||
class SpeculativeConfig:
|
||||
"""Configuration for speculative decoding."""
|
||||
enforce_eager: Optional[bool] = None
|
||||
"""Override the default enforce_eager from model_config"""
|
||||
|
||||
# General speculative decoding control
|
||||
num_speculative_tokens: SkipValidation[int] = None # type: ignore
|
||||
"""The number of speculative tokens, if provided. It will default to the
|
||||
@ -220,11 +219,6 @@ class SpeculativeConfig:
|
||||
assert (
|
||||
self.target_model_config
|
||||
is not None), "target_model_config must be present for mtp"
|
||||
if self.target_model_config.hf_text_config.model_type \
|
||||
== "deepseek_v32":
|
||||
# FIXME(luccafong): cudgraph with v32 MTP is not supported,
|
||||
# remove this when the issue is fixed.
|
||||
self.enforce_eager = True
|
||||
# use the draft model from the same model:
|
||||
self.model = self.target_model_config.model
|
||||
# Align the quantization of draft model for cases such as
|
||||
|
||||
@ -1,21 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""Utility functions for vLLM config dataclasses."""
|
||||
import ast
|
||||
import inspect
|
||||
import textwrap
|
||||
from dataclasses import MISSING, Field, field, fields, is_dataclass
|
||||
from typing import TYPE_CHECKING, Any, TypeVar
|
||||
from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
|
||||
from typing import TYPE_CHECKING, Any, Protocol, TypeVar
|
||||
|
||||
import regex as re
|
||||
from typing_extensions import runtime_checkable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import DataclassInstance
|
||||
|
||||
ConfigType = type[DataclassInstance]
|
||||
else:
|
||||
ConfigType = type
|
||||
DataclassInstance = Any
|
||||
|
||||
ConfigType = type[DataclassInstance]
|
||||
ConfigT = TypeVar("ConfigT", bound=ConfigType)
|
||||
|
||||
|
||||
@ -143,3 +143,33 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
|
||||
|
||||
def is_init_field(cls: ConfigType, name: str) -> bool:
|
||||
return next(f for f in fields(cls) if f.name == name).init
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class SupportsHash(Protocol):
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
...
|
||||
|
||||
|
||||
class SupportsMetricsInfo(Protocol):
|
||||
|
||||
def metrics_info(self) -> dict[str, str]:
|
||||
...
|
||||
|
||||
|
||||
def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT:
|
||||
processed_overrides = {}
|
||||
for field_name, value in overrides.items():
|
||||
assert hasattr(
|
||||
config, field_name), f"{type(config)} has no field `{field_name}`"
|
||||
current_value = getattr(config, field_name)
|
||||
if is_dataclass(current_value) and not is_dataclass(value):
|
||||
assert isinstance(value, dict), (
|
||||
f"Overrides to {type(config)}.{field_name} must be a dict"
|
||||
f" or {type(current_value)}, but got {type(value)}")
|
||||
value = update_config(
|
||||
current_value, # type: ignore[type-var]
|
||||
value)
|
||||
processed_overrides[field_name] = value
|
||||
return replace(config, **processed_overrides)
|
||||
|
||||
788
vllm/config/vllm.py
Normal file
788
vllm/config/vllm.py
Normal file
@ -0,0 +1,788 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import copy
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import field, replace
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
|
||||
|
||||
import torch
|
||||
from pydantic import ConfigDict
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
from .cache import CacheConfig
|
||||
from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode
|
||||
from .device import DeviceConfig
|
||||
from .kv_events import KVEventsConfig
|
||||
from .kv_transfer import KVTransferConfig
|
||||
from .load import LoadConfig
|
||||
from .lora import LoRAConfig
|
||||
from .model import ModelConfig
|
||||
from .observability import ObservabilityConfig
|
||||
from .parallel import ParallelConfig
|
||||
from .scheduler import SchedulerConfig
|
||||
from .speculative import SpeculativeConfig
|
||||
from .structured_outputs import StructuredOutputsConfig
|
||||
from .utils import SupportsHash, config
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig)
|
||||
else:
|
||||
PretrainedConfig = Any
|
||||
|
||||
QuantizationConfig = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@config
|
||||
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
||||
class VllmConfig:
|
||||
"""Dataclass which contains all vllm-related configuration. This
|
||||
simplifies passing around the distinct configurations in the codebase.
|
||||
"""
|
||||
|
||||
# TODO: use default_factory once default constructing ModelConfig doesn't
|
||||
# try to download a model
|
||||
model_config: ModelConfig = None # type: ignore
|
||||
"""Model configuration."""
|
||||
cache_config: CacheConfig = field(default_factory=CacheConfig)
|
||||
"""Cache configuration."""
|
||||
parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
|
||||
"""Parallel configuration."""
|
||||
scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
|
||||
"""Scheduler configuration."""
|
||||
device_config: DeviceConfig = field(default_factory=DeviceConfig)
|
||||
"""Device configuration."""
|
||||
load_config: LoadConfig = field(default_factory=LoadConfig)
|
||||
"""Load configuration."""
|
||||
lora_config: Optional[LoRAConfig] = None
|
||||
"""LoRA configuration."""
|
||||
speculative_config: Optional[SpeculativeConfig] = None
|
||||
"""Speculative decoding configuration."""
|
||||
structured_outputs_config: StructuredOutputsConfig = field(
|
||||
default_factory=StructuredOutputsConfig)
|
||||
"""Structured outputs configuration."""
|
||||
observability_config: Optional[ObservabilityConfig] = None
|
||||
"""Observability configuration."""
|
||||
quant_config: Optional[QuantizationConfig] = None
|
||||
"""Quantization configuration."""
|
||||
compilation_config: CompilationConfig = field(
|
||||
default_factory=CompilationConfig)
|
||||
"""`torch.compile` and cudagraph capture configuration for the model.
|
||||
|
||||
As a shorthand, `-O<n>` can be used to directly specify the compilation
|
||||
level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
|
||||
Currently, -O <n> and -O=<n> are supported as well but this will likely be
|
||||
removed in favor of clearer -O<n> syntax in the future.
|
||||
|
||||
NOTE: level 0 is the default level without any optimization. level 1 and 2
|
||||
are for internal testing only. level 3 is the recommended level for
|
||||
production, also default in V1.
|
||||
|
||||
You can specify the full compilation config like so:
|
||||
`{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
|
||||
"""
|
||||
kv_transfer_config: Optional[KVTransferConfig] = None
|
||||
"""The configurations for distributed KV cache transfer."""
|
||||
kv_events_config: Optional[KVEventsConfig] = None
|
||||
"""The configurations for event publishing."""
|
||||
# some opaque config, only used to provide additional information
|
||||
# for the hash computation, mainly used for testing, debugging or out of
|
||||
# tree config registration.
|
||||
additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
|
||||
"""Additional config for specified platform. Different platforms may
|
||||
support different configs. Make sure the configs are valid for the platform
|
||||
you are using. Contents must be hashable."""
|
||||
instance_id: str = ""
|
||||
"""The ID of the vLLM instance."""
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
WARNING: Whenever a new field is added to this config,
|
||||
ensure that it is included in the factors list if
|
||||
it affects the computation graph.
|
||||
|
||||
Provide a hash that uniquely identifies all the configs
|
||||
that affect the structure of the computation
|
||||
graph from input ids/embeddings to the final hidden states,
|
||||
excluding anything before input ids/embeddings and after
|
||||
the final hidden states.
|
||||
"""
|
||||
factors: list[Any] = []
|
||||
|
||||
# summarize vllm config
|
||||
vllm_factors: list[Any] = []
|
||||
from vllm import __version__
|
||||
vllm_factors.append(__version__)
|
||||
vllm_factors.append(envs.VLLM_USE_V1)
|
||||
if self.model_config:
|
||||
vllm_factors.append(self.model_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.cache_config:
|
||||
vllm_factors.append(self.cache_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.parallel_config:
|
||||
vllm_factors.append(self.parallel_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.scheduler_config:
|
||||
vllm_factors.append(self.scheduler_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.device_config:
|
||||
vllm_factors.append(self.device_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.load_config:
|
||||
vllm_factors.append(self.load_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.lora_config:
|
||||
vllm_factors.append(self.lora_config.compute_hash())
|
||||
# LoRA creates static buffers based on max_num_batched_tokens.
|
||||
# The tensor sizes and strides get captured in the torch.compile
|
||||
# graph explicitly.
|
||||
vllm_factors.append(
|
||||
str(self.scheduler_config.max_num_batched_tokens))
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.speculative_config:
|
||||
vllm_factors.append(self.speculative_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.structured_outputs_config:
|
||||
vllm_factors.append(self.structured_outputs_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.observability_config:
|
||||
vllm_factors.append(self.observability_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.quant_config:
|
||||
pass # should be captured by model_config.quantization
|
||||
if self.compilation_config:
|
||||
vllm_factors.append(self.compilation_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.kv_transfer_config:
|
||||
vllm_factors.append(self.kv_transfer_config.compute_hash())
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
if self.additional_config:
|
||||
if isinstance(additional_config := self.additional_config, dict):
|
||||
additional_config_hash = hashlib.md5(
|
||||
json.dumps(additional_config, sort_keys=True).encode(),
|
||||
usedforsecurity=False,
|
||||
).hexdigest()
|
||||
else:
|
||||
additional_config_hash = additional_config.compute_hash()
|
||||
vllm_factors.append(additional_config_hash)
|
||||
else:
|
||||
vllm_factors.append("None")
|
||||
factors.append(vllm_factors)
|
||||
|
||||
hash_str = hashlib.md5(str(factors).encode(),
|
||||
usedforsecurity=False).hexdigest()[:10]
|
||||
return hash_str
|
||||
|
||||
def pad_for_cudagraph(self, batch_size: int) -> int:
|
||||
# if batch_size > self.compilation_config.max_capture_size,
|
||||
# it should raise an IndexError.
|
||||
# the caller should make sure the batch_size is within the range,
|
||||
# i.e., batch_size <= self.compilation_config.max_capture_size
|
||||
return self.compilation_config.bs_to_padded_graph_size[batch_size]
|
||||
|
||||
@staticmethod
|
||||
def _get_quantization_config(
|
||||
model_config: ModelConfig,
|
||||
load_config: LoadConfig) -> Optional[QuantizationConfig]:
|
||||
"""Get the quantization config."""
|
||||
from vllm.platforms import current_platform
|
||||
if model_config.quantization is not None:
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
get_quant_config)
|
||||
quant_config = get_quant_config(model_config, load_config)
|
||||
capability_tuple = current_platform.get_device_capability()
|
||||
|
||||
if capability_tuple is not None:
|
||||
capability = capability_tuple.to_int()
|
||||
if capability < quant_config.get_min_capability():
|
||||
raise ValueError(
|
||||
f"The quantization method {model_config.quantization} "
|
||||
"is not supported for the current GPU. Minimum "
|
||||
f"capability: {quant_config.get_min_capability()}. "
|
||||
f"Current capability: {capability}.")
|
||||
supported_dtypes = quant_config.get_supported_act_dtypes()
|
||||
if model_config.dtype not in supported_dtypes:
|
||||
raise ValueError(
|
||||
f"{model_config.dtype} is not supported for quantization "
|
||||
f"method {model_config.quantization}. Supported dtypes: "
|
||||
f"{supported_dtypes}")
|
||||
quant_config.maybe_update_config(model_config.model)
|
||||
return quant_config
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_quantization_config(
|
||||
model_config: ModelConfig,
|
||||
load_config: LoadConfig) -> Optional[QuantizationConfig]:
|
||||
import copy
|
||||
|
||||
# For some reason, the _ version of this modifies the model_config
|
||||
# object, so using deepcopy to avoid this problem.
|
||||
return VllmConfig._get_quantization_config(copy.deepcopy(model_config),
|
||||
load_config)
|
||||
|
||||
def with_hf_config(
|
||||
self,
|
||||
hf_config: PretrainedConfig,
|
||||
architectures: Optional[list[str]] = None,
|
||||
) -> "VllmConfig":
|
||||
if architectures is not None:
|
||||
hf_config = copy.deepcopy(hf_config)
|
||||
hf_config.architectures = architectures
|
||||
|
||||
model_config = copy.deepcopy(self.model_config)
|
||||
model_config.hf_config = hf_config
|
||||
|
||||
return replace(self, model_config=model_config)
|
||||
|
||||
    def __post_init__(self):
        """Verify configs are valid & consistent with each other.

        Runs after dataclass construction: applies architecture-specific
        config fix-ups, cross-verifies the sub-configs, resolves defaults
        for compilation level and cudagraph mode, and disables features
        that are incompatible with the platform or with other settings.
        Mutates the sub-configs in place.
        """

        self.try_verify_and_update_config()

        # Cross-checks between sub-configs; each raises on inconsistency.
        if self.model_config is not None:
            self.model_config.verify_with_parallel_config(self.parallel_config)
            self.model_config.verify_dual_chunk_attention_config(
                self.load_config)

        self.cache_config.verify_with_parallel_config(self.parallel_config)

        if self.lora_config is not None:
            self.lora_config.verify_with_cache_config(self.cache_config)
            self.lora_config.verify_with_model_config(self.model_config)

        # Resolve quantization lazily if the caller did not supply one.
        if self.quant_config is None and self.model_config is not None:
            self.quant_config = VllmConfig._get_quantization_config(
                self.model_config, self.load_config)

        from vllm.platforms import current_platform
        if self.model_config is not None and \
            self.scheduler_config.chunked_prefill_enabled and \
            self.model_config.dtype == torch.float32 and \
            current_platform.get_device_capability() == (7, 5):
            logger.warning_once(
                "Turing devices tensor cores do not support float32 matmul. "
                "To workaround this limitation, vLLM will set 'ieee' input "
                "precision for chunked prefill triton kernels.")

        # If the user does not explicitly set a compilation level, then
        # we use the default level. The default level depends on other
        # settings (see the below code).
        if self.compilation_config.level is None:
            if envs.VLLM_USE_V1:
                if (self.model_config is not None
                        and not self.model_config.enforce_eager):
                    self.compilation_config.level = CompilationLevel.PIECEWISE
                else:
                    self.compilation_config.level = \
                        CompilationLevel.NO_COMPILATION

            else:
                # NB: Passing both --enforce-eager and a compilation level
                # in V0 means the compilation level wins out.
                self.compilation_config.level = CompilationLevel.NO_COMPILATION

        # async tp is built on top of sequence parallelism
        # and requires it to be enabled.
        if self.compilation_config.pass_config.enable_async_tp:
            self.compilation_config.pass_config.enable_sequence_parallelism = \
                True
        if self.compilation_config.pass_config.enable_sequence_parallelism:
            # SP pass needs the custom rms_norm op to be enabled.
            self.compilation_config.custom_ops.append("+rms_norm")

        if current_platform.support_static_graph_mode():
            # if cudagraph_mode is not explicitly set by users, set default
            # value
            if self.compilation_config.cudagraph_mode is None:
                if envs.VLLM_USE_V1 and self.compilation_config.level \
                    == CompilationLevel.PIECEWISE:
                    # default to full and piecewise for most models
                    self.compilation_config.cudagraph_mode = \
                        CUDAGraphMode.FULL_AND_PIECEWISE

                    # pooling models and encoder-decoder models
                    # do not support full cudagraphs
                    if self.model_config is not None and \
                        (self.model_config.pooler_config is not None
                         or self.model_config.is_encoder_decoder):
                        self.compilation_config.cudagraph_mode = \
                            CUDAGraphMode.PIECEWISE
                else:
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

            # disable cudagraph when enforce eager execution
            if self.model_config is not None and \
                self.model_config.enforce_eager:
                logger.info("Cudagraph is disabled under eager mode")
                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            elif envs.VLLM_USE_V1:
                self.compilation_config.cudagraph_num_of_warmups = 1

            self._set_cudagraph_sizes()
        else:
            # Platform has no static-graph support at all.
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if self.cache_config.kv_sharing_fast_prefill:

            if self.speculative_config is not None and \
                self.speculative_config.use_eagle():
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not "
                    "compatible with EAGLE as EAGLE requires correct logits "
                    "for all tokens while fast prefill gives incorrect logits "
                    "for prompt tokens.")

            logger.warning_once(
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings. ")

        # Collect reasons first so every one is logged before disabling.
        disable_chunked_prefill_reasons: list[str] = []

        if self.model_config:
            if self.model_config.pooler_config:
                pooling_type = self.model_config.pooler_config.pooling_type
                if pooling_type is None or pooling_type.lower() != "last":
                    disable_chunked_prefill_reasons.append(
                        "Only \"last\" pooling supports chunked "
                        "prefill and prefix caching; disabling both.")
                if not getattr(self.model_config.hf_config, "is_causal", True):
                    disable_chunked_prefill_reasons.append(
                        "Only models using causal attention supports chunked "
                        "prefill and prefix caching; disabling both.")
            elif self.model_config.is_encoder_decoder:
                from vllm.multimodal import MULTIMODAL_REGISTRY
                self.scheduler_config.max_num_encoder_input_tokens = \
                    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
                logger.debug(
                    "Encoder-decoder model detected: setting "
                    "`max_num_encoder_input_tokens` to encoder length (%s)",
                    self.scheduler_config.max_num_encoder_input_tokens)
                if (self.model_config.architecture
                        == "WhisperForConditionalGeneration"
                        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
                        != "spawn"):
                    logger.warning(
                        "Whisper is known to have issues with "
                        "forked workers. If startup is hanging, "
                        "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                        "to 'spawn'.")

        # Disable prefix caching only if chunked prefill is explicitly disabled
        # (and not merely unset)
        if (self.scheduler_config.chunked_prefill_enabled is False
                or disable_chunked_prefill_reasons):
            for reason in disable_chunked_prefill_reasons:
                logger.info(reason)
            self.scheduler_config.chunked_prefill_enabled = False
            self.scheduler_config.long_prefill_token_threshold = 0

            if self.cache_config is not None:
                self.cache_config.enable_prefix_caching = False

        # KV-events sanity warnings (misconfiguration, not fatal).
        if (self.kv_events_config is not None
                and self.kv_events_config.enable_kv_cache_events
                and not self.cache_config.enable_prefix_caching):
            logger.warning(
                "KV cache events are on, but prefix caching is not enabled."
                "Use --enable-prefix-caching to enable.")
        if (self.kv_events_config is not None
                and self.kv_events_config.publisher != "null"
                and not self.kv_events_config.enable_kv_cache_events):
            logger.warning("KV cache events are disabled,"
                           "but the scheduler is configured to publish them."
                           "Modify KVEventsConfig.enable_kv_cache_events"
                           "to True to enable.")
        current_platform.check_and_update_config(self)

        # Do this after all the updates to compilation_config.level
        if envs.VLLM_USE_V1 and \
            self.compilation_config.level == CompilationLevel.PIECEWISE:
            self.compilation_config.set_splitting_ops_for_v1()

        # final check of cudagraph mode after all possible updates
        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
            if self.compilation_config.cudagraph_mode.has_full_cudagraphs()\
                and self.model_config is not None and \
                not self.model_config.disable_cascade_attn and\
                not self.compilation_config.cudagraph_mode.\
                has_piecewise_cudagraphs():
                logger.warning_once(
                    "No piecewise cudagraph for executing cascade attention."
                    " Will fall back to eager execution if a batch runs "
                    "into cascade attentions")

            if self.compilation_config.cudagraph_mode\
                .requires_piecewise_compilation():
                assert self.compilation_config.level == \
                    CompilationLevel.PIECEWISE, \
                    "Compilation level should be CompilationLevel.PIECEWISE "\
                    "when cudagraph_mode piecewise cudagraphs is used, "\
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

            # final migrate the deprecated flags
            self.compilation_config.use_cudagraph = self.compilation_config.\
                cudagraph_mode!= CUDAGraphMode.NONE
            self.compilation_config.full_cuda_graph = self.compilation_config.\
                cudagraph_mode.has_full_cudagraphs()

        if self.parallel_config.enable_dbo:
            a2a_backend = envs.VLLM_ALL2ALL_BACKEND
            assert a2a_backend in \
                ["deepep_low_latency", "deepep_high_throughput"], \
                "Microbatching currently only supports the deepep_low_latency and "\
                f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
                "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
                "variable to deepep_low_latency or deepep_high_throughput and "\
                "install the DeepEP kernels."

            if not self.model_config.disable_cascade_attn:
                self.model_config.disable_cascade_attn = True
                logger.warning_once(
                    "Disabling cascade attention when DBO is enabled.")

        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

        if (envs.VLLM_USE_V1
                and not self.scheduler_config.disable_hybrid_kv_cache_manager):
            # logger should only print warning message for hybrid models. As we
            # can't know whether the model is hybrid or not now, so we don't log
            # warning message here and will log it later.
            if not current_platform.support_hybrid_kv_cache():
                # Hybrid KV cache manager is not supported on non-GPU platforms.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_transfer_config is not None:
                # Hybrid KV cache manager is not compatible with KV transfer.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_events_config is not None:
                # Hybrid KV cache manager is not compatible with KV events.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.model_config is not None and \
                self.model_config.attention_chunk_size is not None:
                if self.speculative_config is not None and \
                    self.speculative_config.use_eagle():
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention + eagle.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
                elif \
                    not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
                    logger.warning(
                        "There is a latency regression when using chunked local"
                        " attention with the hybrid KV cache manager. Disabling"
                        " it, by default. To enable it, set the environment "
                        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
                    )
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True

        # Normalize the debug dump path; the env var wins over the config.
        if self.compilation_config.debug_dump_path:
            self.compilation_config.debug_dump_path = \
                self.compilation_config.debug_dump_path.absolute().expanduser()
        if envs.VLLM_DEBUG_DUMP_PATH is not None:
            env_path = Path(envs.VLLM_DEBUG_DUMP_PATH).absolute().expanduser()
            if self.compilation_config.debug_dump_path:
                logger.warning(
                    "Config-specified debug dump path is overridden"
                    " by VLLM_DEBUG_DUMP_PATH to %s", env_path)
            self.compilation_config.debug_dump_path = env_path
|
||||
|
||||
def update_sizes_for_sequence_parallelism(self,
|
||||
possible_sizes: list) -> list:
|
||||
# remove the sizes that not multiple of tp_size when
|
||||
# enable sequence parallelism
|
||||
removed_sizes = [
|
||||
size for size in possible_sizes
|
||||
if size % self.parallel_config.tensor_parallel_size != 0
|
||||
]
|
||||
if removed_sizes:
|
||||
logger.warning(
|
||||
"Batch sizes %s are removed because they are not "
|
||||
"multiple of tp_size %d when "
|
||||
"sequence parallelism is enabled", removed_sizes,
|
||||
self.parallel_config.tensor_parallel_size)
|
||||
|
||||
return [
|
||||
size for size in possible_sizes
|
||||
if size % self.parallel_config.tensor_parallel_size == 0
|
||||
]
|
||||
|
||||
def _set_cudagraph_sizes(self):
|
||||
"""
|
||||
vLLM defines the default candidate list of batch sizes for CUDA graph
|
||||
capture as:
|
||||
|
||||
```python
|
||||
max_graph_size = min(max_num_seqs * 2, 512)
|
||||
# 1, 2, 4, then multiples of 8 up to max_graph_size
|
||||
cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
|
||||
|
||||
In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
|
||||
will be the final sizes to capture cudagraph (in descending order).
|
||||
|
||||
These sizes are used to capture and reuse CUDA graphs for
|
||||
performance-critical paths (e.g., decoding). Capturing enables
|
||||
significantly faster kernel dispatch by avoiding Python overhead. The
|
||||
list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
|
||||
most GPUs), which controls the total allowed number of tokens in a
|
||||
batch. Since each sequence may have a variable number of tokens, the
|
||||
maximum usable batch size will depend on actual sequence lengths.
|
||||
|
||||
Example:
|
||||
With `max_num_batched_tokens = 8192`, and typical sequences
|
||||
averaging ~32 tokens, most practical batch sizes fall below 256.
|
||||
However, the system will still allow capture sizes up to 512 if
|
||||
shape and memory permit.
|
||||
|
||||
Note:
|
||||
If users explicitly specify cudagraph capture sizes in the
|
||||
compilation config, those will override this default logic.
|
||||
At runtime:
|
||||
|
||||
- If batch size <= one of the `cudagraph_capture_sizes`, the closest
|
||||
padded CUDA graph will be used.
|
||||
- If batch size > largest `cudagraph_capture_sizes`, cudagraph will
|
||||
not be used.
|
||||
"""
|
||||
|
||||
# calculate the default `batch_size_capture_list`
|
||||
batch_size_capture_list = []
|
||||
if self.model_config is not None and \
|
||||
not self.model_config.enforce_eager:
|
||||
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
|
||||
if len(cuda_graph_sizes) == 1:
|
||||
batch_size_capture_list = [1, 2, 4] + [
|
||||
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
|
||||
]
|
||||
elif len(cuda_graph_sizes) > 1:
|
||||
batch_size_capture_list = sorted(cuda_graph_sizes)
|
||||
else:
|
||||
raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
|
||||
if self.parallel_config.tensor_parallel_size > 1 and \
|
||||
self.compilation_config.pass_config.enable_sequence_parallelism:
|
||||
batch_size_capture_list = \
|
||||
self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
|
||||
max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
||||
batch_size_capture_list = [
|
||||
size for size in batch_size_capture_list
|
||||
if size <= max_num_tokens
|
||||
]
|
||||
|
||||
self.compilation_config.init_with_cudagraph_sizes(
|
||||
batch_size_capture_list)
|
||||
|
||||
def recalculate_max_model_len(self, max_model_len: int):
|
||||
# Can only be called in try_verify_and_update_config
|
||||
model_config = self.model_config
|
||||
max_model_len = model_config.get_and_verify_max_len(max_model_len)
|
||||
self.model_config.max_model_len = max_model_len
|
||||
self.scheduler_config.max_model_len = max_model_len
|
||||
|
||||
    def try_verify_and_update_config(self):
        """Apply architecture-specific config adjustments, at most once.

        Looks up the model architecture in ``MODELS_CONFIG_MAP`` and lets
        the matching config class (plus hybrid / classify-conversion /
        Run:ai special cases) verify and update this ``VllmConfig`` in
        place. No-op when there is no model config or the architecture is
        unknown.
        """
        if self.model_config is None:
            return

        # Avoid running try_verify_and_update_config multiple times
        if getattr(self.model_config, "config_updated", False):
            return
        self.model_config.config_updated = True

        architecture = self.model_config.architecture
        if architecture is None:
            return

        from vllm.model_executor.models.config import (
            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
        cls = MODELS_CONFIG_MAP.get(architecture, None)
        if cls is not None:
            cls.verify_and_update_config(self)

        if self.model_config.is_hybrid:
            HybridAttentionMambaModelConfig.verify_and_update_config(self)

        if self.model_config.convert_type == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import (
                SequenceClassificationConfig)
            SequenceClassificationConfig.verify_and_update_config(self)

        # Weights stored in a Run:ai object store require the
        # runai_streamer load format; "auto" is silently upgraded, any
        # other explicit format is an error.
        if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
                self.model_config.model_weights):
            if self.load_config.load_format == "auto":
                logger.info("Detected Run:ai model config. "
                            "Overriding `load_format` to 'runai_streamer'")
                self.load_config.load_format = "runai_streamer"
            elif self.load_config.load_format != "runai_streamer":
                raise ValueError(f"To load a model from S3, 'load_format' "
                                 f"must be 'runai_streamer', "
                                 f"but got '{self.load_config.load_format}'. "
                                 f"Model: {self.model_config.model}")
|
||||
|
||||
def compile_debug_dump_path(self) -> Optional[Path]:
|
||||
"""Returns a rank-aware path for dumping
|
||||
torch.compile debug information.
|
||||
"""
|
||||
if self.compilation_config.debug_dump_path is None:
|
||||
return None
|
||||
tp_rank = self.parallel_config.rank
|
||||
dp_rank = self.parallel_config.data_parallel_rank
|
||||
data_parallel_size = self.parallel_config.data_parallel_size
|
||||
append_path = f"rank_{tp_rank}" if data_parallel_size == 1 \
|
||||
else f"rank_{tp_rank}_dp_{dp_rank}"
|
||||
path = self.compilation_config.debug_dump_path / append_path
|
||||
return path
|
||||
|
||||
    def __str__(self):
        """Human-readable one-line summary of the most important fields
        across the sub-configs."""
        return (
            f"model={self.model_config.model!r}, "
            f"speculative_config={self.speculative_config!r}, "
            f"tokenizer={self.model_config.tokenizer!r}, "
            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
            f"revision={self.model_config.revision}, "
            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
            f"trust_remote_code={self.model_config.trust_remote_code}, "
            f"dtype={self.model_config.dtype}, "
            f"max_seq_len={self.model_config.max_model_len}, "
            f"download_dir={self.load_config.download_dir!r}, "
            f"load_format={self.load_config.load_format}, "
            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
            f"quantization={self.model_config.quantization}, "
            f"enforce_eager={self.model_config.enforce_eager}, "
            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
            f"device_config={self.device_config.device}, "
            f"structured_outputs_config={self.structured_outputs_config!r}, "
            f"observability_config={self.observability_config!r}, "
            f"seed={self.model_config.seed}, "
            f"served_model_name={self.model_config.served_model_name}, "
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")
|
||||
|
||||
|
||||
# Module-level holder for the active config, installed/restored by
# set_current_vllm_config() and read back via get_current_vllm_config().
_current_vllm_config: Optional[VllmConfig] = None
# Optional prefix stored alongside the active config by the same context
# manager.
_current_prefix: Optional[str] = None
|
||||
|
||||
|
||||
@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig,
                            check_compile=False,
                            prefix: Optional[str] = None):
    """
    Temporarily set the current vLLM config.
    Used during model initialization.
    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the vLLM config to determine how to dispatch.

    Args:
        vllm_config: config to install for the duration of the `with` block.
        check_compile: when True and the block exits without an exception,
            run the custom-op log check and warn if a PIECEWISE-level model
            never registered with the compilation counter (i.e. it does not
            support `torch.compile`).
        prefix: optional prefix stored alongside the config.
    """
    global _current_vllm_config, _current_prefix
    old_vllm_config = _current_vllm_config
    old_prefix = _current_prefix
    from vllm.compilation.counter import compilation_counter
    # Snapshot so we can tell afterwards whether model construction
    # registered a compiled model while this config was active.
    num_models_seen = compilation_counter.num_models_seen
    try:
        _current_vllm_config = vllm_config
        _current_prefix = prefix
        yield
    except Exception:
        # Re-raise unchanged; the except clause exists only so the `else`
        # branch below runs exclusively on success.
        raise
    else:
        if check_compile:
            vllm_config.compilation_config.custom_op_log_check()

        if check_compile and \
            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
            and compilation_counter.num_models_seen == num_models_seen:
            # If the model supports compilation,
            # compilation_counter.num_models_seen should be increased
            # by at least 1.
            # If it is not increased, it means the model does not support
            # compilation (does not have @support_torch_compile decorator).
            logger.warning(
                "`torch.compile` is turned on, but the model %s"
                " does not support it. Please open an issue on GitHub"
                " if you want it to be supported.",
                vllm_config.model_config.model)
    finally:
        # Always restore the previous config/prefix, even on error.
        _current_vllm_config = old_vllm_config
        _current_prefix = old_prefix
        # Clear the compilation config cache when context changes
        get_cached_compilation_config.cache_clear()
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_cached_compilation_config():
    """Cache config to avoid repeated calls to get_current_vllm_config().

    The cache is invalidated by set_current_vllm_config() (via
    `cache_clear()`) whenever the active config changes.
    """
    return get_current_vllm_config().compilation_config
|
||||
|
||||
|
||||
def get_current_vllm_config() -> VllmConfig:
    """Return the config installed by set_current_vllm_config().

    Falls back to a default-constructed VllmConfig (with a warning) when
    no config has been set — e.g. in CI when custom ops/modules are tested
    directly outside of model initialization.
    """
    if _current_vllm_config is not None:
        return _current_vllm_config
    # in ci, usually when we test custom ops/modules directly,
    # we don't set the vllm config. In that case, we set a default
    # config.
    logger.warning("Current vLLM config is not set.")
    return VllmConfig()
|
||||
|
||||
|
||||
# Generic layer type used by get_layers_from_vllm_config.
T = TypeVar("T")
|
||||
|
||||
|
||||
def get_layers_from_vllm_config(
        vllm_config: VllmConfig,
        layer_type: type[T],
        layer_names: Optional[list[str]] = None) -> dict[str, T]:
    """
    Get layers from the vLLM config.

    Args:
        vllm_config: The vLLM config.
        layer_type: The type of the layer to get.
        layer_names: The names of the layers to get. If None, return all layers.

    Returns:
        Mapping from layer name to layer, restricted to layers that are
        instances of ``layer_type``.
    """
    forward_context = vllm_config.compilation_config.static_forward_context
    if layer_names is None:
        layer_names = list(forward_context.keys())

    return {
        name: forward_context[name]
        for name in layer_names
        if isinstance(forward_context[name], layer_type)
    }
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user