Compare commits: fix_ds_eag...use-uv-pyt (49 commits)

Commit SHAs: 728c365e4d, be8921fbba, d4e7a1152d, be22bb6f3d, 169313b9f8, 0b018d8baf, c31246800c, 4134312b35, da554f932e, aac622e0cd, 1726e93ef1, ee04c0cd04, c36f0aa300, 5234dc7451, 3b7c20a6b5, f9e714813a, 2518230d3e, a332b84578, 1405f0c7ba, 84d57342b6, 57b46d769e, f48b6a03ba, 2a69ab4899, 8d7da92fd7, e952eee698, 66bca9b8bd, 99028fda44, 1244948885, a73f6491c8, 001e50c92c, 96ebcaa3ad, 5db1870bb9, 2ce26b9b5d, a388252ac4, 9a9f48dff7, 67f3fb0844, 43b752c325, cfd302db9b, fb610ae684, 2f652e6cdf, e6a226efba, a2e6fa7e03, 9f1c4ecaf2, ef283548f7, f4db5e6de1, 099aaee536, 35fe398c7c, bb6d43047e, bc546f76a1
@@ -50,19 +50,28 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/multimodal
- tests/utils_
commands:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/utils_
- tests/standalone_tests/lazy_imports.py
- tests/transformers_utils
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
- pytest -v -s utils_ # Utils
- pytest -v -s transformers_utils # transformers_utils
- pytest -v -s -m 'cpu_test' multimodal
- pytest -v -s transformers_utils

- label: Python-only Installation Test # 10min
timeout_in_minutes: 20

@@ -287,23 +296,34 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- pytest -v -s v1/kv_connector/unit
- pytest -v -s v1/metrics
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
- pytest -v -s v1/test_serial_utils.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test others (CPU) # 5 mins
source_file_dependencies:
- vllm/
- tests/v1
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'cpu_test' v1/metrics

- label: Examples Test # 30min
timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]

@@ -533,10 +553,17 @@ steps:
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/mistral_tool_use
commands:
- pytest -v -s tool_use
- pytest -v -s mistral_tool_use
- pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
no_gpu: true
commands:
- pytest -v -s -m 'cpu_test' tool_use

##### models test #####

@@ -576,13 +603,19 @@ steps:
- vllm/
- tests/models/test_transformers.py
- tests/models/test_registry.py
commands:
- pytest -v -s models/test_transformers.py models/test_registry.py

- label: Basic Models Test (Other CPU) # 5min
timeout_in_minutes: 10
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/test_utils.py
- tests/models/test_vision.py
no_gpu: true
commands:
- pytest -v -s models/test_transformers.py \
models/test_registry.py \
models/test_utils.py \
models/test_vision.py
- pytest -v -s models/test_utils.py models/test_vision.py

- label: Language Models Tests (Standard)
timeout_in_minutes: 25

@@ -812,7 +845,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
.github/mergify.yml (vendored)
@@ -239,7 +239,6 @@ pull_request_rules:
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
@@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

#
# Supported/expected torch versions for CUDA/ROCm.

@@ -86,6 +86,9 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()

@@ -175,6 +178,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()

#
# Set compression mode for CUDA >=13.x.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
endif()

#
# Set CUDA include flags for CXX compiler.
#

@@ -270,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")

# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

@@ -305,7 +317,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
"csrc/quantization/fp8/per_token_group_quant.cu")

@@ -440,7 +451,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"

@@ -470,7 +485,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"

@@ -550,7 +569,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"

@@ -569,7 +592,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

# FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"

@@ -591,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")

@@ -635,7 +666,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()

cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
set_gencode_flags_for_srcs(

@@ -656,7 +691,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()

# moe_data.cu is used by all CUTLASS MoE kernels.
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
set_gencode_flags_for_srcs(

@@ -675,7 +714,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()

cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
@@ -584,8 +584,9 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
elif config.architectures[0] in (
"DeepseekV3ForCausalLM",
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekV32ForCausalLM",
"Glm4MoeForCausalLM",
):
E = config.n_routed_experts
@@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
list(REMOVE_DUPLICATES _PTX_ARCHS)
list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)

# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
# If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
set(_CUDA_ARCHS)
foreach(_arch ${_SRC_CUDA_ARCHS})
if(_arch MATCHES "\\a$")
if(_arch MATCHES "[af]$")
list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
string(REPLACE "a" "" _base "${_arch}")
string(REGEX REPLACE "[af]$" "" _base "${_arch}")
if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
list(APPEND _CUDA_ARCHS "${_arch}")
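To make the arch-matching rule in this hunk easier to follow, here is a rough Python model of the behaviour described by the updated comment. It only illustrates the suffix handling ("a"/"f") shown above; the branch for un-suffixed architectures and the function name are assumptions for the sketch, not part of the CMake code.

```python
# Illustrative model of the "loose intersection" rule described above
# (not the real CMake implementation). A suffixed arch such as "12.0a"
# or "10.0f" is kept when its base version ("12.0", "10.0") appears in
# the target arch list; the un-suffixed branch is an assumption here.
def loose_intersection(src_archs, tgt_archs):
    tgt = list(tgt_archs)
    result = []
    for arch in src_archs:
        if arch.endswith(("a", "f")):
            base = arch[:-1]
            if base in tgt:
                tgt.remove(base)
                result.append(arch)
        elif arch in tgt:
            result.append(arch)
    return result

# e.g. CUDA 13 "f" archs matched against plain CUDA_ARCHS:
print(loose_intersection(["10.0f", "11.0f", "12.0f"], ["10.0", "12.0"]))
# -> ['10.0f', '12.0f']
```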
@@ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
if (local_split_kv <= get<3>(blk_coord))
continue;
if (local_split_kv <= get<3>(blk_coord))
continue;
load_page_table(
blk_coord,
problem_shape,
params.mainloop,
shared_storage.tensors,
pipeline_page_table, pipeline_pt_producer_state,
local_split_kv
local_split_kv
);
}
}

@@ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
if (local_split_kv <= get<3>(blk_coord))
if (local_split_kv <= get<3>(blk_coord))
continue;
load_cpasync(
blk_coord,

@@ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
params.mainloop_params,
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
local_split_kv,
local_split_kv,
/* must be shared pipe */
pipeline_page_table, pipeline_pt_consumer_state
);

@@ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
if (local_split_kv <= get<3>(blk_coord))
if (local_split_kv <= get<3>(blk_coord))
continue;
load_tma</* paged= */ true>(
blk_coord,

@@ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state,
local_split_kv
local_split_kv
);
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
}

@@ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
CUTLASS_PRAGMA_NO_UNROLL
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
}
if (local_split_kv <= get<3>(blk_coord))
if (local_split_kv <= get<3>(blk_coord))
continue;
load_tma<false>(
blk_coord,

@@ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
shared_storage.tensors,
pipeline_load_qk, pipeline_load_qk_producer_state,
pipeline_load_qk, pipeline_load_qk_producer_state,
local_split_kv
local_split_kv
);
cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
}

@@ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
if (local_split_kv <= get<3>(blk_coord))
if (local_split_kv <= get<3>(blk_coord))
continue;
mma(blk_coord,
problem_shape,

@@ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_producer_state,
pipeline_p_mma, pipeline_p_mma_consumer_state,
pipeline_mma_o, pipeline_mma_o_producer_state,
local_split_kv
local_split_kv
);
}
}

@@ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
for (; tile_scheduler.is_valid(); ++tile_scheduler) {
auto blk_coord = tile_scheduler.get_block_coord();
auto problem_shape = params.problem_shape;
auto split_kv = params.split_kv;
auto local_split_kv = split_kv;
auto split_kv = params.split_kv;
auto local_split_kv = split_kv;
if (params.mainloop.ptr_seq != nullptr) {
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
}
if (local_split_kv <= get<3>(blk_coord))
if (local_split_kv <= get<3>(blk_coord))
continue;
compute(
blk_coord,

@@ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
pipeline_mma_s, pipeline_mma_s_consumer_state,
pipeline_p_mma, pipeline_p_mma_producer_state,
pipeline_mma_o, pipeline_mma_o_consumer_state,
local_split_kv
local_split_kv
);
}

@@ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
cutlass::arch::NamedBarrier(
(kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
kNamedBarrierEpilogue
).arrive();
).arrive_and_wait();

return;
}
@@ -536,7 +536,9 @@ __global__ void indexer_k_quant_and_cache_kernel(
for (int i = 0; i < VEC_SIZE; i++) {
amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
}
#ifndef USE_ROCM
__syncwarp();
#endif

// Reduced amax
for (int mask = 16; mask > 0; mask /= 2) {

@@ -546,7 +548,9 @@ __global__ void indexer_k_quant_and_cache_kernel(
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
#endif
}
#ifndef USE_ROCM
__syncwarp();
#endif
float scale = fmaxf(amax, 1e-4) / 448.0f;
if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale)));

@@ -1167,4 +1171,4 @@ void indexer_k_quant_and_cache(

DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3",
CALL_INDEXER_K_QUANT_AND_CACHE);
}
}
@@ -8,11 +8,37 @@
#define VLLM_LAUNCH_BLOCKS_CAP 4
#endif

// compile-time estimate of max threads per SM for launch bounds.
// Compile-time estimate of max threads per SM for launch bounds.
// Families: 1024, 1536, 2048 threads/SM.
#ifndef VLLM_MAX_THREADS_PER_SM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#define VLLM_MAX_THREADS_PER_SM 1536
#ifdef __CUDA_ARCH__

/* 1024 thr/SM: Turing (sm_75) */
#if (__CUDA_ARCH__ == 750)
#define VLLM_MAX_THREADS_PER_SM 1024

/* 1536 thr/SM: Ampere GA10x (sm_86/87), Ada (sm_89),
GB20x consumer (sm_120/121), Thor (sm_101 or sm_110) */
#elif (__CUDA_ARCH__ == 860) || (__CUDA_ARCH__ == 870) || \
(__CUDA_ARCH__ == 890) || (__CUDA_ARCH__ == 1010) || \
(__CUDA_ARCH__ == 1100) || (__CUDA_ARCH__ == 1200) || \
(__CUDA_ARCH__ == 1210)
#define VLLM_MAX_THREADS_PER_SM 1536

/* 2048 thr/SM: Volta (sm_70/72), Ampere GA100 (sm_80),
Hopper (sm_90), Blackwell (sm_100/103) */
#elif (__CUDA_ARCH__ == 700) || (__CUDA_ARCH__ == 720) || \
(__CUDA_ARCH__ == 800) || (__CUDA_ARCH__ == 900) || \
(__CUDA_ARCH__ == 1000) || (__CUDA_ARCH__ == 1030)
#define VLLM_MAX_THREADS_PER_SM 2048

/* Fallback: use 2048 for unknown future CCs */
#else
#define VLLM_MAX_THREADS_PER_SM 2048
#endif

#else
/* Host pass (no __CUDA_ARCH__): neutral default */
#define VLLM_MAX_THREADS_PER_SM 2048
#endif
#endif
@@ -231,7 +231,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
}

@@ -245,7 +245,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
}

@@ -259,7 +259,7 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
Shape<_2, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales);
}

@@ -271,10 +271,10 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
// TMA epilogue isn't compatible with Swap A/B
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
Shape<_1, _1, _1>, cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
out, a, b, a_scales, b_scales);
}
}

} // namespace vllm
} // namespace vllm
@@ -25,7 +25,10 @@ void dispatch_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
if constexpr (!std::is_same_v<Int8Func, std::nullptr_t>) {
int8_func(c, a, b, a_scales, b_scales, bias);
} else {
TORCH_CHECK(false, "Int8 not supported for this architecture");
int32_t version_num = get_sm_version_num();
TORCH_CHECK(
false, "Int8 not supported on SM", version_num,
". Use FP8 quantization instead, or run on older arch (SM < 100).");
}
}
} else {

@@ -133,4 +133,4 @@ void cutlass_scaled_mm_sm100_fp8_epilogue(torch::Tensor& out,
}
}

} // namespace vllm
} // namespace vllm
@@ -67,8 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
std::optional<torch::Tensor> const& bias);
#endif

#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,

@@ -40,7 +40,8 @@ using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
#define __HIP__FP8MFMA__
#endif

#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__) || \
defined(__gfx1150__) || defined(__gfx1151__))
#define __HIP__GFX11__
#endif
@@ -13,8 +13,8 @@ ARG PYTHON_VERSION=3.12
# private registries that use a different repository naming conventions.
#
# Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

@@ -79,31 +79,11 @@ ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL

# Install Python and other dependencies
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl sudo

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

@@ -111,9 +91,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694

@@ -296,32 +276,12 @@ ARG GET_PIP_URL
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

# Install Python and other dependencies
# Install minimal dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
fi ; \
else \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1

ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL

@@ -329,9 +289,9 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER

# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# Install uv and Python
COPY --from=ghcr.io/astral-sh/uv:0.8.22 /uv /uvx /bin/
RUN uv python install ${PYTHON_VERSION} --default --verbose

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@@ -47,7 +47,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"

ENV UV_HTTP_TIMEOUT=500

# Install Python dependencies
# Install Python dependencies
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_INDEX_STRATEGY="unsafe-best-match"

@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel

######################### TEST DEPS #########################
FROM base AS vllm-test-deps

@@ -117,7 +117,7 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu

RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cpu-test.txt
uv pip install -r requirements/cpu-test.txt

######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev

@@ -130,12 +130,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
uv pip install -e tests/vllm_test_utils

RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop
VLLM_TARGET_DEVICE=cpu python3 setup.py develop

COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt

@@ -160,11 +160,12 @@ ADD ./benchmarks/ ./benchmarks/
ADD ./vllm/collect_env.py .
ADD ./.buildkite/ ./.buildkite/

# Create symlink for vllm-workspace to maintain CI compatibility
RUN ln -sf /workspace /vllm-workspace

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils

ENTRYPOINT ["bash"]
uv pip install -e tests/vllm_test_utils

######################### RELEASE IMAGE #########################
FROM base AS vllm-openai
@@ -6,7 +6,7 @@ ARG CUDA_VERSION=12.8.0
#
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
ARG CUDA_VERSION=12.8.0
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM

@@ -15,7 +15,7 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950

@@ -141,4 +141,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

@@ -1,2 +1,2 @@
search:
boost: 0.5
exclude: true
Binary files (new screenshots under docs/assets/deployment/ for the Hugging Face Inference Endpoints guide, plus one updated image: 119 KiB before, 127 KiB after):

- docs/assets/deployment/hf-inference-endpoints-catalog.png (new)
- docs/assets/deployment/hf-inference-endpoints-choose-infra.png (new)
- docs/assets/deployment/hf-inference-endpoints-new-endpoint.png (new)
- docs/assets/deployment/hf-inference-endpoints-select-model.png (new)
- several additional new images whose file names are not shown in this view

docs/deployment/frameworks/hf_inference_endpoints.md (new file, 170 lines)
@@ -0,0 +1,170 @@
# Hugging Face Inference Endpoints

## Overview

Models compatible with vLLM can be deployed on Hugging Face Inference Endpoints, either starting from the [Hugging Face Hub](https://huggingface.co) or directly from the [Inference Endpoints](https://endpoints.huggingface.co/) interface. This allows you to serve models in a fully managed environment with GPU acceleration, auto-scaling, and monitoring, without managing the infrastructure manually.

For advanced details on vLLM integration and deployment options, see [Advanced Deployment Details](#advanced-deployment-details).

## Deployment Methods

- [**Method 1: Deploy from the Catalog.**](#method-1-deploy-from-the-catalog) One-click deploy models from the Hugging Face Hub with ready-made optimized configurations.
- [**Method 2: Guided Deployment (Transformers Models).**](#method-2-guided-deployment-transformers-models) Instantly deploy models tagged with `transformers` from the Hub UI using the **Deploy** button.
- [**Method 3: Manual Deployment (Advanced Models).**](#method-3-manual-deployment-advanced-models) For models that either use custom code with the `transformers` tag, or don’t run with standard `transformers` but are supported by vLLM. This method requires manual configuration.

### Method 1: Deploy from the Catalog

This is the easiest way to get started with vLLM on Hugging Face Inference Endpoints. You can browse a catalog of models with verified, performance-optimized deployment configurations at [Inference Endpoints](https://endpoints.huggingface.co/catalog).

1. Go to the [Endpoints Catalog](https://endpoints.huggingface.co/catalog) and, in the **Inference Server** options, select `vLLM`. This will display the current list of models with optimized preconfigured options.



1. Select the desired model and click **Create Endpoint**.



1. Once the deployment is ready, you can use the endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console, remembering to append `/v1` as required.

```python
# pip install openai
from openai import OpenAI
import os

client = OpenAI(
    base_url = DEPLOYMENT_URL,
    api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
)

chat_completion = client.chat.completions.create(
    model = "HuggingFaceTB/SmolLM3-3B",
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Give me a brief explanation of gravity in simple terms."
                }
            ]
        }
    ],
    stream = True
)

for message in chat_completion:
    print(message.choices[0].delta.content, end = "")
```

!!! note
The catalog provides models optimized for vLLM, including GPU settings and inference engine configurations. You can monitor the endpoint and update the **container or its configuration** from the Inference Endpoints UI.

### Method 2: Guided Deployment (Transformers Models)

This method applies to models with the `transformers` library tag in their metadata. It allows you to deploy a model directly from the Hub UI without manual configuration.

1. Navigate to a model on the [Hugging Face Hub](https://huggingface.co/models).
For this example we will use the [`ibm-granite/granite-docling-258M`](https://huggingface.co/ibm-granite/granite-docling-258M) model. You can verify that the model is compatible by checking the front matter in the [README](https://huggingface.co/ibm-granite/granite-docling-258M/blob/main/README.md), where the library is tagged as `library: transformers`.

2. Locate the **Deploy** button. The button appears for models tagged with `transformers` at the top right of the [model card](https://huggingface.co/ibm-granite/granite-docling-258M).



3. Click the **Deploy** button > **HF Inference Endpoints**. You will be taken to the Inference Endpoints interface to configure the deployment.



4. Select the Hardware (we choose AWS > GPU > T4 for the example) and Container Configuration. Choose `vLLM` as the container type and finalize the deployment by pressing **Create Endpoint**.



5. Use the deployed endpoint. Update the `DEPLOYMENT_URL` with the URL provided in the console (remember to add `/v1` if needed). You can then use your endpoint programmatically or via the SDK.

```python
# pip install openai
from openai import OpenAI
import os

client = OpenAI(
    base_url = DEPLOYMENT_URL,
    api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
)

chat_completion = client.chat.completions.create(
    model = "ibm-granite/granite-docling-258M",
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
                    }
                },
                {
                    "type": "text",
                    "text": "Convert this page to docling."
                }
            ]
        }
    ],
    stream = True
)

for message in chat_completion:
    print(message.choices[0].delta.content, end = "")
```

!!! note
This method uses best-guess defaults. You may need to adjust the configuration to fit your specific requirements.

### Method 3: Manual Deployment (Advanced Models)

Some models require manual deployment because they:

- Use custom code with the `transformers` tag
- Don't run with standard `transformers` but are supported by `vLLM`

These models cannot be deployed using the **Deploy** button on the model card.

In this guide, we demonstrate manual deployment using the [rednote-hilab/dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) model, an OCR model integrated with vLLM (see vLLM [PR](https://github.com/vllm-project/vllm/pull/24645)).

1. Start a new deployment. Go to [Inference Endpoints](https://endpoints.huggingface.co/) and click `New`.



2. Search for the model in the Hub. In the dialog, switch to **Hub** and search for the desired model.



3. Choose the infrastructure. On the configuration page, select the cloud provider and hardware from the available options.
For this demo, we choose AWS and an L4 GPU. Adjust according to your hardware needs.



4. Configure the container. Scroll to the **Container Configuration** and select `vLLM` as the container type.



5. Create the endpoint. Click **Create Endpoint** to deploy the model.

Once the endpoint is ready, you can use it with the OpenAI Completion API, cURL, or other SDKs. Remember to append `/v1` to the deployment URL if needed, as in the sketch below.
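For example, here is a minimal sketch of querying the endpoint through its OpenAI-compatible chat API, mirroring the earlier examples. The image URL and prompt are placeholders, and `DEPLOYMENT_URL` is the endpoint URL from the console with `/v1` appended.

```python
# pip install openai
# Minimal sketch: query the manually deployed endpoint via its
# OpenAI-compatible chat API. DEPLOYMENT_URL is the endpoint URL from
# the console with /v1 appended; the image URL and prompt are placeholders.
import os

from openai import OpenAI

client = OpenAI(
    base_url = DEPLOYMENT_URL,
    api_key = os.environ["HF_TOKEN"]  # https://huggingface.co/settings/tokens
)

chat_completion = client.chat.completions.create(
    model = "rednote-hilab/dots.ocr",
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/page.png"}},  # placeholder image
                {"type": "text", "text": "Extract the text from this page."},
            ],
        }
    ],
)
print(chat_completion.choices[0].message.content)
```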
!!! note
You can adjust the **container settings** (Container URI, Container Arguments) from the Inference Endpoints UI and press **Update Endpoint**. This redeploys the endpoint with the updated container configuration. Changes to the model itself require creating a new endpoint or redeploying with a different model. For example, for this demo, you may need to update the Container URI to the nightly image (`vllm/vllm-openai:nightly`) and add the `--trust-remote-code` flag in the container arguments.

## Advanced Deployment Details

With the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html), vLLM now offers Day 0 support for any model compatible with `transformers`. This means you can deploy such models immediately, leveraging vLLM’s optimized inference without additional backend modifications.

Hugging Face Inference Endpoints provides a fully managed environment for serving models via vLLM. You can deploy models without configuring servers, installing dependencies, or managing clusters. Endpoints also support deployment across multiple cloud providers (AWS, Azure, GCP) without the need for separate accounts.

The platform integrates seamlessly with the Hugging Face Hub, allowing you to deploy any vLLM- or `transformers`-compatible model, track usage, and update the inference engine directly. The vLLM engine comes preconfigured, enabling optimized inference and easy switching between models or engines without modifying your code. This setup simplifies production deployment: endpoints are ready in minutes, include monitoring and logging, and let you focus on serving models rather than maintaining infrastructure.

## Next Steps

- Explore the [Inference Endpoints](https://endpoints.huggingface.co/catalog) model catalog
- Read the Inference Endpoints [documentation](https://huggingface.co/docs/inference-endpoints/en/index)
- Learn about [Inference Endpoints engines](https://huggingface.co/docs/inference-endpoints/en/engines/vllm)
- Understand the [transformers backend integration](https://blog.vllm.ai/2025/04/11/transformers-backend.html)
@@ -242,30 +242,8 @@ Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kern

## FusedMoEPrepareAndFinalize Implementations

The following table lists the `FusedMoEPrepareAndFinalize` implementations at the time of writing,

| Implementation | Type | Comments |
| :--- | :--- | :--- |
| DeepEPHTPrepareAndFinalize | Contiguous / Non-Batched | Uses the DeepEP High-Throughput all2all kernels. |
| DeepEPLLPrepareAndFinalize | Batched | Uses the DeepEP Low-Latency all2all kernels. |
| PplxPrepareAndFinalize | Batched | Uses the Perplexity all2all kernels. |
| FlashInferCutlassMoEPrepareAndFinalize | Contiguous | |
| MoEPrepareAndFinalizeNoEP | Contiguous | This implementation is used when there is no EP. i.e. no all2all kernels are invoked. |
| BatchedPrepareAndFinalize | Batched | A reference prepare/finalize class that reorganizes the tokens into expert batched format, i.e. E x max_num_tokens x K. (Doesn’t use any all2all kernels. This is primarily used in unit testing) |
See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-modular-all2all-backends) for a list of all the available modular prepare and finalize subclasses.

## FusedMoEPermuteExpertsUnpermute

The following table lists the `FusedMoEPermuteExpertsUnpermute` implementations at the time of writing,

| Implementation | Type | Comment |
| :--- | :--- | :--- |
| BatchedDeepGemmExperts | Batched | Uses the DeepGemm’s Masked Grouped Gemm kernels for the fused_moe operation. |
| BatchedTritonExperts | Batched | Uses a Triton Kernel for the Batched matmuls. |
| BatchedTritonOrDeepGemmExperts | Batched | Chooses either the `BatchedDeepGemmExperts` or `BatchedTritonExperts` based on environment settings. |
| DeepGemmExperts | Contiguous / Non-Batched | Uses DeepGemm’s Grouped Gemm kernels for fused_moe operation. |
| TritonExperts | Contiguous / Non-Batched | Uses a Triton Kernel for fused_moe matmuls. |
| TritonOrDeepGemmExperts | Contiguous / Non-Batched | Chooses either the `DeepGemmExperts` or `TritonExperts` based on fused_moe inputs. |
| CutlassExpertsFP8 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp8 matmuls. |
| CutlassExpertsFP4 | Supports both Batched and Contiguous formats | Uses Cutlass Grouped Gemm implementations for the fp4 matmuls. |
| FlashInferExperts | Contiguous | Uses fused_moe operation from FlashInfer |
| NaiveBatchedExperts | Batched | Reference Batched Experts implementation. Primarily used in unit tests. |
See [Fused MoE Kernel features](./moe_kernel_features.md#fused-moe-experts-kernels) for a list of all the available modular experts.
docs/design/moe_kernel_features.md (new file, 119 lines)
@@ -0,0 +1,119 @@
# Fused MoE Kernel features

The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels.

## Fused MoE Modular All2All backends

There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend.

The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes, and async support.

The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass; the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format, and all the `finalize` methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document.

The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch, based on the format the all2all backend supports; e.g. deepep_high_throughput supports only the block-quantized fp8 format, so any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations; e.g. if the original input is bfloat16 and the quantization scheme is fp8 with per-tensor scales, `prepare` will return fp8/per-tensor-scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16.

Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step).

Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass; for non-modular kernels, it is up to the experts function to deal with this flag.

Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `flashinfer` can work with EP, or with DP without EP.
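As an illustration of how this selection looks in practice, the sketch below sets `VLLM_ALL2ALL_BACKEND` before constructing an engine with expert parallelism enabled. The model name and the engine arguments (`tensor_parallel_size`, `enable_expert_parallel`) are assumptions for the example and may differ between vLLM versions.

```python
# Illustrative sketch only: pick the all2all backend via the
# VLLM_ALL2ALL_BACKEND environment variable before vLLM is imported,
# then start an engine with expert parallelism (EP+TP here). The model
# and engine arguments are placeholders and may vary across versions.
import os

os.environ["VLLM_ALL2ALL_BACKEND"] = "deepep_low_latency"  # e.g. "pplx", "naive", ...

from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    tensor_parallel_size=2,
    enable_expert_parallel=True,
)
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```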
|
||||
<style>
|
||||
td {
|
||||
padding: 0.5rem !important;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
th {
|
||||
padding: 0.5rem !important;
|
||||
min-width: 0 !important;
|
||||
}
|
||||
</style>
|
||||
|
||||
| Backend                               | Output act. format | Quant. types    | Quant. format          | Async | Apply Weight On Input | Sub-class |
|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|-----------|
| naive                                 | standard           | all<sup>1</sup> | G,A,T                  | N     | <sup>6</sup>          | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] |
| pplx                                  | batched            | fp8,int8        | G,A,T                  | Y     | Y                     | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
| deepep_high_throughput                | standard           | fp8             | G(128),A,T<sup>2</sup> | Y     | Y                     | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency                    | batched            | fp8             | G(128),A,T<sup>3</sup> | Y     | Y                     | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
| flashinfer_all2allv                   | standard           | nvfp4,fp8       | G,A,T                  | N     | N                     | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] |
| flashinfer<sup>4</sup>                | standard           | nvfp4,fp8       | G,A,T                  | N     | N                     | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] |
| MoEPrepareAndFinalizeNoEP<sup>5</sup> | standard           | fp8,int8        | G,A,T                  | N     | Y                     | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] |
| BatchedPrepareAndFinalize<sup>5</sup> | batched            | fp8,int8        | G,A,T                  | N     | Y                     | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] |

!!! info "Table key"
    1. All types: mxfp4, nvfp4, int4, int8, fp8
    2. A,T quantization occurs after dispatch.
    3. All quantization happens after dispatch.
    4. Controlled by a different env var: `VLLM_FLASHINFER_MOE_BACKEND` ("throughput" or "latency").
    5. These are no-op dispatchers that can be paired with any modular experts to produce a modular kernel that runs without dispatch or combine. They cannot be selected via environment variable and are generally used for testing or for adapting an experts subclass to the `fused_experts` API.
    6. This depends on the experts implementation.

---

- G - Grouped
- G(N) - Grouped w/block size N
- A - Per activation token
- T - Per tensor

Modular kernels are supported by the following `FusedMoEMethodBase` classes.

- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

## Fused MoE Experts Kernels

There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. The table below lists each experts kernel and its particular properties.

Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.

Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts yield outputs in the original activation type.

Each experts kernel supports one or more activation functions, e.g. silu and gelu, which are applied to the intermediate results.

As with the backends, some experts support applying topk weights on the input activations. The entries in this column of the table only apply to the non-modular experts.

Most experts flavors include an equivalent modular interface, which is a subclass of `FusedMoEPermuteExpertsUnpermute`.

To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats, as sketched below.

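The sketch below shows the shape of that composition: a prepare/finalize object wrapped around an experts object. The method names and signatures here are illustrative only; see the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document for the real interfaces.

```python
# Schematic only: the real classes are FusedMoEPrepareAndFinalize and
# FusedMoEPermuteExpertsUnpermute; argument lists are simplified here.
def modular_moe(prepare_finalize, experts, hidden_states, topk_weights, topk_ids):
    # 1. prepare: (optionally) quantize and dispatch tokens to their experts
    a_q, a_scales, dispatch_meta = prepare_finalize.prepare(hidden_states, topk_ids)
    # 2. experts: grouped GEMMs + activation on the dispatched tokens
    expert_out = experts.apply(a_q, a_scales, dispatch_meta, topk_weights)
    # 3. finalize: combine results back into the original layout and dtype
    return prepare_finalize.finalize(expert_out, topk_weights, topk_ids)
```
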
| Kernel                       | Input act. format     | Quant. types     | Quant. format | Activation function                                         | Apply Weight On Input | Modular | Source |
|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|--------|
| triton                       | standard              | all<sup>1</sup>  | G,A,T         | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y                     | Y       | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
| triton (batched)             | batched               | all<sup>1</sup>  | G,A,T         | silu, gelu                                                   | <sup>6</sup>          | Y       | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| deep gemm                    | standard,</br>batched | fp8              | G(128),A,T    | silu, gelu                                                   | <sup>6</sup>          | Y       | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4                  | standard,</br>batched | nvfp4            | A,T           | silu                                                         | Y                     | Y       | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],</br>[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8                  | standard,</br>batched | fp8              | A,T           | silu, gelu                                                   | Y                     | Y       | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer                   | standard              | nvfp4,</br>fp8   | T             | <sup>5</sup>                                                 | N                     | Y       | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton               | standard              | N/A              | N/A           | <sup>5</sup>                                                 | Y                     | Y       | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup>  | G(128),A,T    | silu, gelu                                                   | <sup>6</sup>          | Y       | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
| marlin                       | standard              | <sup>3</sup>     | <sup>3</sup>  | silu,</br>swigluoai                                          | Y                     | N       | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe] |
| trtllm                       | standard              | mxfp4,</br>nvfp4 | G(16),G(32)   | <sup>5</sup>                                                 | N                     | Y       | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
| pallas                       | standard              | N/A              | N/A           | silu                                                         | N                     | N       | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
| iterative                    | standard              | N/A              | N/A           | silu                                                         | N                     | N       | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
| rocm aiter moe               | standard              | fp8              | G(128),A,T    | silu, gelu                                                   | Y                     | N       | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_moe_impl] |
| cpu_fused_moe                | standard              | N/A              | N/A           | silu                                                         | N                     | N       | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
| naive batched<sup>4</sup>    | batched               | int8,</br>fp8    | G,A,T         | silu, gelu                                                   | <sup>6</sup>          | Y       | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |

!!! info "Table key"
    1. All types: mxfp4, nvfp4, int4, int8, fp8
    2. A dispatcher wrapper around the triton and deep gemm experts; it selects one based on type, shape and quantization parameters.
    3. uint4, uint8, fp8, fp4
    4. A naive implementation of experts that supports the batched format. Mainly used for testing.
    5. The `activation` parameter is ignored and SwiGLU is used by default instead.
    6. Only handled by or supported when used with modular kernels.

## Modular Kernel "families"

The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts.

| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
| deepep_low_latency,</br>pplx     | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8` |
| deepep_high_throughput           | `DeepEPHTPrepareAndFinalize`                               | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8` |
| flashinfer                       | `FlashInferCutlassMoEPrepareAndFinalize`                   | `FlashInferExperts` |

@ -16,7 +16,7 @@ vLLM will take all the available factors into consideration, and decide a direct
|
||||
|
||||
The factors considered include:
|
||||
|
||||
- All the related configs (see the `compute_hash` functions in the [config.py](gh-file:vllm/config.py))
|
||||
- All the related configs (see the `compute_hash` functions in their respective configs in the [config folder](gh-file:vllm/config))
|
||||
- PyTorch configs (see the `compute_hash` functions in the [compiler_interface.py](gh-file:vllm/compilation/compiler_interface.py))
|
||||
- The model's forward function and the relevant functions called by the forward function (see below)
|
||||
|
||||
|
||||
@ -428,7 +428,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
|
||||
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
||||
|
||||
For certain models, we provide alternative chat templates inside <gh-dir:examples>.
|
||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
|
||||
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec_phi3v.jinja> which is different from the default one for Phi-3-Vision.
|
||||
|
||||
### Image Inputs
|
||||
|
||||
|
||||
@ -6,7 +6,11 @@ This quantization method is particularly useful for reducing model size while ma
|
||||
Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
|
||||
|
||||
!!! note
|
||||
INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
|
||||
INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
|
||||
|
||||
!!! warning
|
||||
**Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
|
||||
Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
|
||||
@ -24,6 +24,13 @@ vllm serve s3://core-llm/Llama-3-8b \
|
||||
--load-format runai_streamer
|
||||
```
|
||||
|
||||
To run model from Google Cloud Storage run:
|
||||
|
||||
```bash
|
||||
vllm serve gs://core-llm/Llama-3-8b \
|
||||
--load-format runai_streamer
|
||||
```
|
||||
|
||||
To run model from a S3 compatible object store run:
|
||||
|
||||
```bash
|
||||
|
||||
@ -626,7 +626,29 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
|
||||
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g., `--limit-mm-per-prompt '{"image":0}'`) so that their multimodal modules will not be loaded, freeing up more GPU memory for the KV cache.
|
||||
|
||||
!!! note
|
||||
vLLM currently only supports adding LoRA to the language backbone of multimodal models.
|
||||
vLLM currently only supports dynamic LoRA adapters on the language backbone of multimodal models.
|
||||
If you wish to use a model with LoRA in the multi-modal encoder,
|
||||
please merge the weights into the base model first before running it in vLLM like a regular model.
|
||||
|
||||
```python
|
||||
from peft import PeftConfig, PeftModel
|
||||
from transformers import AutoModelForImageTextToText, AutoProcessor
|
||||
|
||||
def merge_and_save(model_id: str, output_dir: str):
|
||||
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
|
||||
lora_model = PeftModel.from_pretrained(
|
||||
base_model,
|
||||
model_id,
|
||||
config=PeftConfig.from_pretrained(model_id),
|
||||
)
|
||||
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
|
||||
model._hf_peft_config_loaded = False # Needed to save the merged model
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_id)
|
||||
|
||||
model.save_pretrained(output_dir)
|
||||
processor.save_pretrained(output_dir)
|
||||
```
|
||||
|
||||
### Generative Models
|
||||
|
||||
@ -805,8 +827,8 @@ The following table lists those that are tested in vLLM.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | ✅︎ |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | \* |
|
||||
|
||||
<sup>C</sup> Automatically converted into an embedding model via `--convert embed`. ([details](./pooling_models.md#model-conversion))
|
||||
|
||||
@ -236,11 +236,33 @@ The following extra parameters are supported:
|
||||
Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
|
||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||
|
||||
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
|
||||
which will be treated as a single prompt to the model.
|
||||
|
||||
Code example: <gh-file:examples/online_serving/pooling/openai_embedding_client.py>
|
||||
|
||||
If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
|
||||
which will be treated as a single prompt to the model. Here is a convenience function for calling the API while retaining OpenAI's type annotations:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
from openai._types import NOT_GIVEN, NotGiven
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
from openai.types.create_embedding_response import CreateEmbeddingResponse
|
||||
|
||||
def create_chat_embeddings(
|
||||
client: OpenAI,
|
||||
*,
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
model: str,
|
||||
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
|
||||
) -> CreateEmbeddingResponse:
|
||||
return client.post(
|
||||
"/embeddings",
|
||||
cast_to=CreateEmbeddingResponse,
|
||||
body={"messages": messages, "model": model, "encoding_format": encoding_format},
|
||||
)
|
||||
```
|
||||
|
||||
#### Multi-modal inputs
|
||||
|
||||
You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
|
||||
@ -254,7 +276,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/template_vlm2vec.jinja
|
||||
--chat-template examples/template_vlm2vec_phi3v.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
@ -262,34 +284,36 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
The custom chat template is completely different from the original one for this model,
|
||||
and can be found here: <gh-file:examples/template_vlm2vec.jinja>
|
||||
and can be found here: <gh-file:examples/template_vlm2vec_phi3v.jinja>
|
||||
|
||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
from openai import OpenAI
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="EMPTY",
|
||||
)
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [{
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
}
|
||||
],
|
||||
encoding_format="float",
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
```
|
||||
|
||||
=== "DSE-Qwen2-MRL"
|
||||
|
||||
@ -0,0 +1,30 @@
|
||||
# KV Load Failure Recovery Test

This example builds upon the `disaggregated-prefill-v1` example in `examples/offline_inference`.

It demonstrates vLLM's ability to recover from KV load failures in both synchronous and asynchronous loading modes. The goal is to verify that vLLM correctly identifies invalid KV blocks, reschedules the affected requests, and ensures successful and consistent output.

## Files

- `prefill_example.py` – performs the prefill stage and saves KV data (same as in `disaggregated-prefill-v1`).
- `decode_example.py` – performs the decode stage. Accepts:
    - `--simulate-failure`: simulates KV load failure using a custom connector.
    - `--async-load`: enables asynchronous KV loading mode.
- `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request.
- `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages:
    1. Normal decode (baseline).
    2. Decode with simulated sync KV load failure.
    3. Decode with simulated async KV load failure.

Finally, it compares the output of the baseline with the recovered outputs to verify correctness.

## How It Works

- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector.
- The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode.
- If recovery fails, the script prints a unified diff of the output mismatch and exits with an error.

## Usage

```bash
./run.sh
```

@ -0,0 +1,85 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
|
||||
def read_prompts():
|
||||
"""Read prompts from prefill_output.txt"""
|
||||
prompts = []
|
||||
try:
|
||||
with open("prefill_output.txt") as f:
|
||||
for line in f:
|
||||
prompts.append(line.strip())
|
||||
print(f"Loaded {len(prompts)} prompts from prefill_output.txt")
|
||||
return prompts
|
||||
except FileNotFoundError:
|
||||
print("Error: prefill_output.txt file not found")
|
||||
exit(-1)
|
||||
|
||||
|
||||
def main():
|
||||
prompts = read_prompts()
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--simulate-failure", action="store_true", help="Simulate KV load failure."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--async-load", action="store_true", help="Simulate async KV load"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.simulate_failure:
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="RogueSharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"shared_storage_path": "local_storage",
|
||||
"async_load": args.async_load,
|
||||
},
|
||||
kv_connector_module_path="rogue_shared_storage_connector",
|
||||
)
|
||||
out_file = (
|
||||
"async_decode_recovered_output.txt"
|
||||
if args.async_load
|
||||
else "sync_decode_recovered_output.txt"
|
||||
)
|
||||
else:
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"shared_storage_path": "local_storage",
|
||||
},
|
||||
)
|
||||
out_file = "decode_output.txt"
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.8,
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=16,
|
||||
kv_transfer_config=ktc,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
sep_str = "-" * 30
|
||||
with open(out_file, "w", encoding="utf-8") as f:
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
out_str = f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}"
|
||||
print(out_str)
|
||||
print(sep_str)
|
||||
f.write(out_str)
|
||||
f.write(sep_str)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -0,0 +1,58 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
|
||||
def read_prompts():
|
||||
context = "Hi " * 1000
|
||||
context2 = "Hey " * 500
|
||||
return [
|
||||
context + "Hello, my name is",
|
||||
context + "The capital of France is",
|
||||
context2 + "Your name is",
|
||||
context2 + "The capital of China is",
|
||||
]
|
||||
|
||||
|
||||
def main():
|
||||
prompts = read_prompts()
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
|
||||
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.8,
|
||||
kv_transfer_config=KVTransferConfig(
|
||||
kv_connector="SharedStorageConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={"shared_storage_path": "local_storage"},
|
||||
),
|
||||
) # , max_model_len=2048, max_num_batched_tokens=2048)
|
||||
|
||||
# 1ST generation (prefill instance)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
)
|
||||
|
||||
new_prompts = []
|
||||
print("-" * 30)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
new_prompts.append(prompt + generated_text)
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 30)
|
||||
|
||||
# Write new_prompts to prefill_output.txt
|
||||
with open("prefill_output.txt", "w") as f:
|
||||
for prompt in new_prompts:
|
||||
f.write(prompt + "\n")
|
||||
print(f"Saved {len(new_prompts)} prompts to prefill_output.txt")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -0,0 +1,145 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
|
||||
KVConnectorMetadata,
|
||||
KVConnectorRole,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (
|
||||
SharedStorageConnector,
|
||||
SharedStorageConnectorMetadata,
|
||||
)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||
from vllm.v1.request import Request
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
logger = logging.getLogger()
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata):
|
||||
req_to_block_ids: dict[str, set[int]] = field(default_factory=dict)
|
||||
|
||||
@classmethod
|
||||
def from_base(cls, base: SharedStorageConnectorMetadata):
|
||||
return cls(requests=base.requests)
|
||||
|
||||
|
||||
class RogueSharedStorageConnector(SharedStorageConnector):
|
||||
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
|
||||
super().__init__(vllm_config=vllm_config, role=role)
|
||||
self._async_load = vllm_config.kv_transfer_config.get_from_extra_config(
|
||||
"async_load", False
|
||||
)
|
||||
self._invalid_block_ids: set = None
|
||||
self._seen_requests: set = set()
|
||||
self._req_to_block_ids: dict[str, list[int]] = dict()
|
||||
|
||||
def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None:
|
||||
assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata)
|
||||
index, failed_request = next(
|
||||
(
|
||||
(i, x)
|
||||
for i, x in enumerate(connector_metadata.requests)
|
||||
if not x.is_store
|
||||
),
|
||||
(None, None),
|
||||
)
|
||||
if index is not None:
|
||||
del connector_metadata.requests[index]
|
||||
self._invalid_block_ids = set(
|
||||
(
|
||||
failed_request.slot_mapping[:: self._block_size] // self._block_size
|
||||
).tolist()
|
||||
)
|
||||
logger.info(
|
||||
"Simulating failure to load all KV blocks for the "
|
||||
"first load request. Total blocks: %d",
|
||||
len(self._invalid_block_ids),
|
||||
)
|
||||
super().bind_connector_metadata(connector_metadata)
|
||||
|
||||
def clear_connector_metadata(self) -> None:
|
||||
self._invalid_block_ids = None
|
||||
super().clear_connector_metadata()
|
||||
|
||||
def start_load_kv(self, forward_context: ForwardContext, **kwargs) -> None:
|
||||
if self._async_load and forward_context.attn_metadata is None:
|
||||
# Bypass sanity check in super().start_load_kv
|
||||
forward_context.attn_metadata = "None"
|
||||
|
||||
super().start_load_kv(forward_context, **kwargs)
|
||||
|
||||
def get_finished(
|
||||
self, finished_req_ids: set[str]
|
||||
) -> tuple[Optional[set[str]], Optional[set[str]]]:
|
||||
if self._async_load:
|
||||
meta = self._get_connector_metadata()
|
||||
assert isinstance(meta, RogueSharedStorageConnectorMetadata)
|
||||
if meta.req_to_block_ids:
|
||||
return None, set(meta.req_to_block_ids)
|
||||
|
||||
return None, None
|
||||
|
||||
def get_block_ids_with_load_errors(self) -> set[int]:
|
||||
return self._invalid_block_ids
|
||||
|
||||
def get_num_new_matched_tokens(
|
||||
self,
|
||||
request: Request,
|
||||
num_computed_tokens: int,
|
||||
) -> tuple[int, bool]:
|
||||
if request.request_id in self._seen_requests:
|
||||
return 0, False
|
||||
|
||||
self._seen_requests.add(request.request_id)
|
||||
|
||||
num_tokens, _ = super().get_num_new_matched_tokens(request, num_computed_tokens)
|
||||
return num_tokens, self._async_load and num_tokens > 0
|
||||
|
||||
def update_state_after_alloc(
|
||||
self, request: Request, blocks: KVCacheBlocks, num_external_tokens: int
|
||||
):
|
||||
"""
|
||||
Update KVConnector state after block allocation.
|
||||
|
||||
If blocks were allocated, add to _requests_need_load,
|
||||
such that we load the KVs in the next forward pass.
|
||||
"""
|
||||
super().update_state_after_alloc(request, blocks, num_external_tokens)
|
||||
|
||||
if num_external_tokens > 0:
|
||||
self._req_to_block_ids[request.request_id] = blocks.get_block_ids()[0]
|
||||
|
||||
def build_connector_meta(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
) -> KVConnectorMetadata:
|
||||
if not self._async_load:
|
||||
base = super().build_connector_meta(scheduler_output)
|
||||
meta = RogueSharedStorageConnectorMetadata.from_base(base)
|
||||
else:
|
||||
meta = RogueSharedStorageConnectorMetadata()
|
||||
if self._requests_need_load:
|
||||
for req_id, request in self._requests_need_load.items():
|
||||
meta.add_request(
|
||||
token_ids=request.prompt_token_ids,
|
||||
block_ids=self._req_to_block_ids[req_id],
|
||||
block_size=self._block_size,
|
||||
is_store=False,
|
||||
mm_hashes=[],
|
||||
)
|
||||
# Clear state
|
||||
self._requests_need_load.clear()
|
||||
meta.req_to_block_ids = self._req_to_block_ids
|
||||
self._req_to_block_ids = dict()
|
||||
return meta
|
||||
33
examples/offline_inference/kv_load_failure_recovery/run.sh
Executable file
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Constants
|
||||
SHARED_STORAGE_DIR="local_storage"
|
||||
PREFILL_OUTPUT="prefill_output.txt"
|
||||
DECODE_OUTPUT="decode_output.txt"
|
||||
SYNC_DECODE_RECOVERED_OUTPUT="sync_decode_recovered_output.txt"
|
||||
ASYNC_DECODE_RECOVERED_OUTPUT="async_decode_recovered_output.txt"
|
||||
|
||||
# Cleanup
|
||||
rm -rf "$SHARED_STORAGE_DIR"
|
||||
rm -f "$PREFILL_OUTPUT" "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"
|
||||
|
||||
# Run inference examples
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py --simulate-failure
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py --simulate-failure --async-load
|
||||
|
||||
# Compare outputs
|
||||
if ! cmp -s "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT"; then
|
||||
echo "❌ Outputs differ: sync recovery failed."
|
||||
diff -u "$DECODE_OUTPUT" "$SYNC_DECODE_RECOVERED_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! cmp -s "$DECODE_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"; then
|
||||
echo "❌ Outputs differ: async recovery failed."
|
||||
diff -u "$DECODE_OUTPUT" "$ASYNC_DECODE_RECOVERED_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✅ Outputs match: recovery successful."
|
||||
@ -10,6 +10,7 @@ on HuggingFace model repository.
|
||||
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
||||
|
||||
from PIL.Image import Image
|
||||
@ -19,6 +20,9 @@ from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent
|
||||
EXAMPLES_DIR = ROOT_DIR / "examples"
|
||||
|
||||
|
||||
class TextQuery(TypedDict):
|
||||
modality: Literal["text"]
|
||||
@ -82,23 +86,27 @@ def run_e5_v(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
image = query["image"]
|
||||
elif query["modality"] == "text+image":
|
||||
text = query["text"]
|
||||
prompt = (
|
||||
f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
|
||||
)
|
||||
prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
raise ValueError(f"Unsupported query modality: {modality!r}")
|
||||
|
||||
return prompt, image
|
||||
|
||||
|
||||
def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
|
||||
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
@ -116,6 +124,66 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
|
||||
# vLLM does not support LoRA adapters on multi-modal encoder,
|
||||
# so we merge the weights first
|
||||
from huggingface_hub.constants import HF_HUB_CACHE
|
||||
from peft import PeftConfig, PeftModel
|
||||
from transformers import AutoModelForImageTextToText, AutoProcessor
|
||||
|
||||
from vllm.entrypoints.chat_utils import load_chat_template
|
||||
|
||||
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
|
||||
|
||||
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
|
||||
lora_model = PeftModel.from_pretrained(
|
||||
base_model,
|
||||
model_id,
|
||||
config=PeftConfig.from_pretrained(model_id),
|
||||
)
|
||||
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
|
||||
model._hf_peft_config_loaded = False # Needed to save the merged model
|
||||
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_id,
|
||||
# `min_pixels` and `max_pixels` are deprecated
|
||||
size={"shortest_edge": 3136, "longest_edge": 12845056},
|
||||
)
|
||||
processor.chat_template = load_chat_template(
|
||||
# The original chat template is not correct
|
||||
EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
|
||||
)
|
||||
|
||||
merged_path = str(
|
||||
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
|
||||
)
|
||||
print(f"Saving merged model to {merged_path}...")
|
||||
print(
|
||||
"NOTE: This directory is not tracked by `huggingface_hub` "
|
||||
"so you have to delete this manually if you don't want it anymore."
|
||||
)
|
||||
model.save_pretrained(merged_path)
|
||||
processor.save_pretrained(merged_path)
|
||||
print("Done!")
|
||||
|
||||
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=merged_path,
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
if query["modality"] != "text+images":
|
||||
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
|
||||
@ -232,7 +300,8 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):
|
||||
|
||||
model_example_map = {
|
||||
"e5_v": run_e5_v,
|
||||
"vlm2vec": run_vlm2vec,
|
||||
"vlm2vec_phi3v": run_vlm2vec_phi3v,
|
||||
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
}
|
||||
|
||||
@ -246,7 +315,7 @@ def parse_args():
|
||||
"--model-name",
|
||||
"-m",
|
||||
type=str,
|
||||
default="vlm2vec",
|
||||
default="vlm2vec_phi3v",
|
||||
choices=model_example_map.keys(),
|
||||
help="The name of the embedding model.",
|
||||
)
|
||||
|
||||
@ -4,69 +4,137 @@
|
||||
"""Example Python client for multimodal embedding API using vLLM API server
|
||||
NOTE:
|
||||
start a supported multimodal embeddings model server with `vllm serve`, e.g.
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling --trust_remote_code --max_model_len=1024
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full \
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/template_vlm2vec_phi3v.jinja
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import io
|
||||
from typing import Literal, Union
|
||||
|
||||
import requests
|
||||
from openai import OpenAI
|
||||
from openai._types import NOT_GIVEN, NotGiven
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
from openai.types.create_embedding_response import CreateEmbeddingResponse
|
||||
from PIL import Image
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
|
||||
|
||||
|
||||
def vlm2vec():
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "TIGER-Lab/VLM2Vec-Full",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
"encoding_format": "float",
|
||||
},
|
||||
def create_chat_embeddings(
|
||||
client: OpenAI,
|
||||
*,
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
model: str,
|
||||
encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
|
||||
) -> CreateEmbeddingResponse:
|
||||
"""
|
||||
Convenience function for accessing vLLM's Chat Embeddings API,
|
||||
which is an extension of OpenAI's existing Embeddings API.
|
||||
"""
|
||||
return client.post(
|
||||
"/embeddings",
|
||||
cast_to=CreateEmbeddingResponse,
|
||||
body={"messages": messages, "model": model, "encoding_format": encoding_format},
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
|
||||
|
||||
def dse_qwen2_vl(inp: dict):
|
||||
# Embedding an Image
|
||||
if inp["type"] == "image":
|
||||
messages = [
|
||||
def run_vlm2vec(client: OpenAI, model: str):
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "text", "text": "Represent the given image."},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Represent the given image with the following question: What is in the image.",
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image+Text embedding output:", response.data[0].embedding)
|
||||
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "A cat and a dog"},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
def run_dse_qwen2_vl(client: OpenAI, model: str):
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": inp["image_url"],
|
||||
"url": image_url,
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "What is shown in this image?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
# Embedding a Text Query
|
||||
else:
|
||||
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
|
||||
# of the minimum input size
|
||||
buffer = io.BytesIO()
|
||||
image_placeholder = Image.new("RGB", (56, 56))
|
||||
image_placeholder.save(buffer, "png")
|
||||
buffer.seek(0)
|
||||
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
|
||||
messages = [
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
print("Image embedding output:", response.data[0].embedding)
|
||||
|
||||
# MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
|
||||
# of the minimum input size
|
||||
buffer = io.BytesIO()
|
||||
image_placeholder = Image.new("RGB", (56, 56))
|
||||
image_placeholder.save(buffer, "png")
|
||||
buffer.seek(0)
|
||||
image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
@ -76,23 +144,21 @@ def dse_qwen2_vl(inp: dict):
|
||||
"url": f"data:image/jpeg;base64,{image_placeholder}",
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": f"Query: {inp['content']}"},
|
||||
{"type": "text", "text": "Query: What is the weather like today?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/embeddings",
|
||||
json={
|
||||
"model": "MrLight/dse-qwen2-2b-mrl-v1",
|
||||
"messages": messages,
|
||||
"encoding_format": "float",
|
||||
},
|
||||
],
|
||||
model=model,
|
||||
encoding_format="float",
|
||||
)
|
||||
response.raise_for_status()
|
||||
response_json = response.json()
|
||||
|
||||
print("Embedding output:", response_json["data"][0]["embedding"])
|
||||
print("Text embedding output:", response.data[0].embedding)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"vlm2vec": run_vlm2vec,
|
||||
"dse_qwen2_vl": run_dse_qwen2_vl,
|
||||
}
|
||||
|
||||
|
||||
def parse_args():
|
||||
@ -103,29 +169,24 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
choices=["vlm2vec", "dse_qwen2_vl"],
|
||||
choices=model_example_map.keys(),
|
||||
required=True,
|
||||
help="Which model to call.",
|
||||
help="The name of the embedding model.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.model == "vlm2vec":
|
||||
vlm2vec()
|
||||
elif args.model == "dse_qwen2_vl":
|
||||
dse_qwen2_vl(
|
||||
{
|
||||
"type": "image",
|
||||
"image_url": image_url,
|
||||
}
|
||||
)
|
||||
dse_qwen2_vl(
|
||||
{
|
||||
"type": "text",
|
||||
"content": "What is the weather like today?",
|
||||
}
|
||||
)
|
||||
client = OpenAI(
|
||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
models = client.models.list()
|
||||
model_id = models.data[0].id
|
||||
|
||||
model_example_map[args.model](client, model_id)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
15
examples/template_vlm2vec_qwen2vl.jinja
Normal file
@ -0,0 +1,15 @@
|
||||
{%- if messages | length > 1 -%}
|
||||
{{ raise_exception('Embedding models should only embed one message at a time') }}
|
||||
{%- endif -%}
|
||||
|
||||
{% set vars = namespace(parts=[]) %}
|
||||
{%- for message in messages -%}
|
||||
{%- for content in message['content'] -%}
|
||||
{%- if content['type'] == 'text' -%}
|
||||
{%- set vars.parts = vars.parts + [content['text']] %}
|
||||
{%- elif content['type'] == 'image' -%}
|
||||
{%- set vars.parts = vars.parts + ['<|image_pad|>'] %}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- endfor -%}
|
||||
{{ vars.parts | join(' ') }}
|
||||
@ -126,6 +126,7 @@ markers = [
|
||||
"core_model: enable this model test in each PR instead of only nightly",
|
||||
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
|
||||
"cpu_model: enable this model test in CPU tests",
|
||||
"cpu_test: mark test as CPU-only test",
|
||||
"split: run this test as part of a split",
|
||||
"distributed: run this test only in distributed GPU tests",
|
||||
"skip_v1: do not run this test with v1",
|
||||
|
||||
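For reference, a hypothetical test using the new `cpu_test` marker might look like the following; the test name and body are made up for illustration.

```python
# Hypothetical test illustrating the new marker; CPU-only suites can then be
# selected with `pytest -m 'cpu_test'` and excluded with `-m 'not cpu_test'`.
import pytest

@pytest.mark.cpu_test
def test_runs_without_a_gpu():
    assert 1 + 1 == 2
```
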
@ -29,8 +29,8 @@ opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
||||
mteb>=1.38.11, <2 # required for mteb test
|
||||
transformers==4.52.4
|
||||
tokenizers==0.21.1
|
||||
transformers==4.56.2
|
||||
tokenizers==0.22.0
|
||||
schemathesis>=3.39.15 # Required for openai schema test.
|
||||
# quantization
|
||||
bitsandbytes>=0.46.1
|
||||
@ -43,6 +43,6 @@ tritonclient==2.51.0
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
numpy
|
||||
runai-model-streamer[s3]==0.14.0
|
||||
runai-model-streamer[s3,gcs]==0.14.0
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.10 # 2.9 leads to error on python 3.10
|
||||
|
||||
@ -13,6 +13,6 @@ tensorizer==2.10.1
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
runai-model-streamer[s3]==0.14.0
|
||||
runai-model-streamer[s3,gcs]==0.14.0
|
||||
conch-triton-kernels==1.2.1
|
||||
timm>=1.0.17
|
||||
timm>=1.0.17
|
||||
|
||||
@ -37,8 +37,8 @@ datamodel_code_generator # required for minicpm3 test
|
||||
# TODO: Use lm-eval[api]==0.4.10 once released
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
||||
mteb[bm25s]>=1.38.11, <2 # required for mteb test
|
||||
transformers==4.55.2
|
||||
tokenizers==0.21.1
|
||||
transformers==4.56.2
|
||||
tokenizers==0.22.0
|
||||
schemathesis>=3.39.15 # Required for openai schema test.
|
||||
# quantization
|
||||
bitsandbytes==0.46.1
|
||||
@ -51,7 +51,7 @@ tritonclient==2.51.0
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
numpy
|
||||
runai-model-streamer[s3]==0.14.0
|
||||
runai-model-streamer[s3,gcs]==0.14.0
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.10 # 2.9 leads to error on python 3.10
|
||||
decord==0.6.0
|
||||
|
||||
@ -251,11 +251,27 @@ gitdb==4.0.12
|
||||
gitpython==3.1.44
|
||||
# via mlflow-skinny
|
||||
google-api-core==2.24.2
|
||||
# via opencensus
|
||||
# via
|
||||
# google-cloud-core
|
||||
# google-cloud-storage
|
||||
# opencensus
|
||||
google-auth==2.40.2
|
||||
# via
|
||||
# databricks-sdk
|
||||
# google-api-core
|
||||
# google-cloud-core
|
||||
# google-cloud-storage
|
||||
# runai-model-streamer-gcs
|
||||
google-cloud-core==2.4.3
|
||||
# via google-cloud-storage
|
||||
google-cloud-storage==3.4.0
|
||||
# via runai-model-streamer-gcs
|
||||
google-crc32c==1.7.1
|
||||
# via
|
||||
# google-cloud-storage
|
||||
# google-resumable-media
|
||||
google-resumable-media==2.7.2
|
||||
# via google-cloud-storage
|
||||
googleapis-common-protos==1.70.0
|
||||
# via google-api-core
|
||||
graphene==3.4.3
|
||||
@ -890,6 +906,7 @@ requests==2.32.3
|
||||
# docker
|
||||
# evaluate
|
||||
# google-api-core
|
||||
# google-cloud-storage
|
||||
# huggingface-hub
|
||||
# lightly
|
||||
# lm-eval
|
||||
@ -929,6 +946,8 @@ rtree==1.4.0
|
||||
# via torchgeo
|
||||
runai-model-streamer==0.14.0
|
||||
# via -r requirements/test.in
|
||||
runai-model-streamer-gcs==0.14.0
|
||||
# via runai-model-streamer
|
||||
runai-model-streamer-s3==0.14.0
|
||||
# via runai-model-streamer
|
||||
s3transfer==0.10.3
|
||||
@ -1072,7 +1091,7 @@ timm==1.0.17
|
||||
# segmentation-models-pytorch
|
||||
# terratorch
|
||||
# torchgeo
|
||||
tokenizers==0.21.1
|
||||
tokenizers==0.22.0
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# transformers
|
||||
@ -1153,7 +1172,7 @@ tqdm==4.66.6
|
||||
# transformers
|
||||
tqdm-multiprocess==0.0.11
|
||||
# via lm-eval
|
||||
transformers==4.55.2
|
||||
transformers==4.56.2
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# genai-perf
|
||||
|
||||
@ -731,6 +731,9 @@ class VllmRunner:
|
||||
init_ctx = (nullcontext() if default_torch_num_threads is None else
|
||||
set_default_torch_num_threads(default_torch_num_threads))
|
||||
|
||||
if not kwargs.get("compilation_config", None):
|
||||
kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]}
|
||||
|
||||
with init_ctx:
|
||||
self.llm = LLM(
|
||||
model=model_name,
|
||||
|
||||
@ -379,6 +379,14 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
|
||||
if event.type == "response.created":
|
||||
resp_id = event.response.id
|
||||
|
||||
# test vllm custom types are in the response
|
||||
if event.type in [
|
||||
"response.completed", "response.in_progress",
|
||||
"response.created"
|
||||
]:
|
||||
assert 'input_messages' in event.response.model_extra
|
||||
assert 'output_messages' in event.response.model_extra
|
||||
|
||||
if current_event_mode != event.type:
|
||||
current_event_mode = event.type
|
||||
print(f"\n[{event.type}] ", end="", flush=True)
|
||||
|
||||
@ -14,7 +14,7 @@ from vllm.multimodal.utils import encode_image_base64, fetch_image
|
||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||
MAXIMUM_IMAGES = 2
|
||||
|
||||
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
|
||||
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
|
||||
assert vlm2vec_jinja_path.exists()
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
|
||||
@ -2468,7 +2468,8 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
("template_falcon.jinja", "string"),
|
||||
("template_inkbot.jinja", "string"),
|
||||
("template_teleflm.jinja", "string"),
|
||||
("template_vlm2vec.jinja", "openai"),
|
||||
("template_vlm2vec_phi3v.jinja", "openai"),
|
||||
("template_vlm2vec_qwen2vl.jinja", "openai"),
|
||||
("tool_chat_template_granite_20b_fc.jinja", "string"),
|
||||
("tool_chat_template_hermes.jinja", "string"),
|
||||
("tool_chat_template_internlm2_tool.jinja", "string"),
|
||||
|
||||
@ -26,7 +26,8 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
|
||||
# Build the command to run the evaluation
|
||||
cmd = [
|
||||
sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
|
||||
model_name, "--reasoning-effort", "low", "--base-url", base_url
|
||||
model_name, "--reasoning-effort", "low", "--base-url", base_url,
|
||||
"--n-threads", "200"
|
||||
]
|
||||
|
||||
try:
|
||||
@ -72,8 +73,6 @@ def test_gpqa_correctness(request):
|
||||
|
||||
# Add standard server arguments
|
||||
server_args.extend([
|
||||
"--max-model-len",
|
||||
"32768",
|
||||
"--trust-remote-code",
|
||||
])
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ from vllm.model_executor.layers.mamba.ops.ssd_combined import (
|
||||
mamba_chunk_scan_combined_varlen)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.attention.backends.mamba2_attn import (
|
||||
_query_start_loc_to_chunk_indices_offsets)
|
||||
compute_varlen_chunk_metadata)
|
||||
|
||||
# Added by the IBM Team, 2024
|
||||
|
||||
@ -225,13 +225,9 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
|
||||
Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt,
|
||||
B, C, chunk_size)
|
||||
|
||||
cu_seqlens = torch.tensor((0, seqlen), device='cuda').cumsum(dim=0)
|
||||
seq_idx = torch.zeros(seqlen, dtype=torch.int32, device=cu_seqlens.device)
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
cu_seqlens, chunk_size, cu_seqlens[-1])
|
||||
|
||||
cu_seqlens = torch.tensor((0, seqlen), device="cuda").cumsum(dim=0)
|
||||
cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
|
||||
compute_varlen_chunk_metadata(cu_seqlens, chunk_size))
|
||||
# varlen has implicit batch=1
|
||||
X = X.squeeze(0)
|
||||
dt = dt.squeeze(0)
|
||||
@ -239,18 +235,20 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
|
||||
B = B.squeeze(0)
|
||||
C = C.squeeze(0)
|
||||
Y = torch.empty_like(X)
|
||||
final_state = mamba_chunk_scan_combined_varlen(X,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=cu_seqlens,
|
||||
seq_idx=seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
out=Y)
|
||||
final_state = mamba_chunk_scan_combined_varlen(
|
||||
X,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
chunk_size,
|
||||
cu_seqlens=cu_seqlens.to(torch.int32),
|
||||
cu_chunk_seqlens=cu_chunk_seqlens,
|
||||
last_chunk_indices=last_chunk_indices,
|
||||
seq_idx=seq_idx_chunks,
|
||||
out=Y,
|
||||
D=None,
|
||||
)
|
||||
|
||||
# just test the last in sequence
|
||||
torch.testing.assert_close(Y[-1], Y_min[0, -1], atol=atol, rtol=rtol)
|
||||
@ -312,14 +310,13 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
exhausted: dict = {} # map: eg -> boolean indicating example is exhausted
|
||||
|
||||
states = None
|
||||
for Y_min, cu_seqlens, seq_idx, (
|
||||
for Y_min, cu_seqlens, _token_seq_idx, (
|
||||
A, dt, X, B, C) in generate_continuous_batched_examples(
|
||||
cases, num_examples, seqlen, last_taken, exhausted, n_heads,
|
||||
d_head, itype):
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
cu_seqlens, chunk_size, cu_seqlens[-1])
|
||||
cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
|
||||
compute_varlen_chunk_metadata(cu_seqlens, chunk_size))
|
||||
|
||||
Y = torch.empty_like(X)
|
||||
new_states = mamba_chunk_scan_combined_varlen(
|
||||
@ -329,13 +326,13 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
B,
|
||||
C,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=cu_seqlens,
|
||||
seq_idx=seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
initial_states=states,
|
||||
cu_seqlens=cu_seqlens.to(torch.int32),
|
||||
cu_chunk_seqlens=cu_chunk_seqlens,
|
||||
last_chunk_indices=last_chunk_indices,
|
||||
seq_idx=seq_idx_chunks,
|
||||
out=Y,
|
||||
D=None,
|
||||
initial_states=states,
|
||||
)
|
||||
|
||||
# just test the last in sequence
|
||||
@ -403,9 +400,8 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
device = X.device
|
||||
|
||||
## full seqlen computation
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
cu_seqlens, chunk_size, cu_seqlens[-1])
|
||||
cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
|
||||
compute_varlen_chunk_metadata(cu_seqlens, chunk_size))
|
||||
Y_ref = torch.empty_like(X)
|
||||
state_ref = mamba_chunk_scan_combined_varlen(
|
||||
X,
|
||||
@ -414,13 +410,13 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
B,
|
||||
C,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=cu_seqlens,
|
||||
seq_idx=seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
initial_states=None,
|
||||
cu_seqlens=cu_seqlens.to(torch.int32),
|
||||
cu_chunk_seqlens=cu_chunk_seqlens,
|
||||
last_chunk_indices=last_chunk_indices,
|
||||
seq_idx=seq_idx_chunks,
|
||||
out=Y_ref,
|
||||
D=None,
|
||||
initial_states=None,
|
||||
)
|
||||
|
||||
## chunked seqlen computation
|
||||
@ -431,10 +427,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
torch.cumsum(chunked_seqlens, dim=0)
|
||||
],
|
||||
dim=0)
|
||||
chunked_seq_idx = torch.repeat_interleave(
|
||||
torch.arange(len(chunked_seqlens), device=device),
|
||||
chunked_seqlens,
|
||||
output_size=chunked_cu_seqlens[-1]).to(torch.int32)
|
||||
chunked_input_seq_len = chunked_cu_seqlens[-1]
|
||||
X_chunked = torch.zeros_like(X)[:chunked_input_seq_len, ...]
|
||||
dt_chunked = torch.zeros_like(dt)[:chunked_input_seq_len, ...]
|
||||
@ -450,9 +442,8 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
C_chunked[chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i) # noqa: E501
|
||||
# fmt: on
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
chunked_cu_seqlens, chunk_size, chunked_cu_seqlens[-1])
|
||||
cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
|
||||
compute_varlen_chunk_metadata(chunked_cu_seqlens, chunk_size))
|
||||
Y_partial = torch.empty_like(X_chunked)
|
||||
partial_state = mamba_chunk_scan_combined_varlen(
|
||||
X_chunked,
|
||||
@ -461,13 +452,13 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
B_chunked,
|
||||
C_chunked,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=chunked_cu_seqlens,
|
||||
seq_idx=chunked_seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
initial_states=None,
|
||||
cu_seqlens=chunked_cu_seqlens.to(torch.int32),
|
||||
cu_chunk_seqlens=cu_chunk_seqlens,
|
||||
last_chunk_indices=last_chunk_indices,
|
||||
seq_idx=seq_idx_chunks,
|
||||
out=Y_partial,
|
||||
D=None,
|
||||
initial_states=None,
|
||||
)
|
||||
|
||||
# remaining chunk
|
||||
@ -477,10 +468,6 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
torch.cumsum(remaining_chunked_seqlens, dim=0)
|
||||
],
|
||||
dim=0)
|
||||
remaining_chunked_seq_idx = torch.repeat_interleave(
|
||||
torch.arange(len(remaining_chunked_seqlens), device=device),
|
||||
remaining_chunked_seqlens,
|
||||
output_size=remaining_chunked_cu_seqlens[-1]).to(torch.int32)
|
||||
remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1]
|
||||
# fmt: off
|
||||
remaining_X_chunked = torch.zeros_like(X)[:remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
@ -509,11 +496,9 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
assert concat_batch_f(B_chunked, remaining_B_chunked).equal(B)
|
||||
assert concat_batch_f(C_chunked, remaining_C_chunked).equal(C)
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
remaining_chunked_cu_seqlens,
|
||||
chunk_size,
|
||||
remaining_chunked_cu_seqlens[-1])
|
||||
cu_chunk_seqlens, last_chunk_indices, seq_idx_chunks = (
|
||||
compute_varlen_chunk_metadata(remaining_chunked_cu_seqlens,
|
||||
chunk_size))
|
||||
|
||||
Y_chunked = torch.empty_like(remaining_X_chunked)
|
||||
state_chunked = mamba_chunk_scan_combined_varlen(
|
||||
@ -523,13 +508,13 @@ def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
remaining_B_chunked,
|
||||
remaining_C_chunked,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=remaining_chunked_cu_seqlens,
|
||||
seq_idx=remaining_chunked_seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
initial_states=partial_state,
|
||||
cu_seqlens=remaining_chunked_cu_seqlens.to(torch.int32),
|
||||
cu_chunk_seqlens=cu_chunk_seqlens,
|
||||
last_chunk_indices=last_chunk_indices,
|
||||
seq_idx=seq_idx_chunks,
|
||||
out=Y_chunked,
|
||||
D=None,
|
||||
initial_states=partial_state,
|
||||
)
|
||||
Y = concat_batch_f(Y_partial, Y_chunked)
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import glob
import hashlib
import os
import tempfile

@@ -9,7 +10,8 @@ import huggingface_hub.constants

from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf)
from vllm.transformers_utils.runai_utils import (is_runai_obj_uri,
from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
                                                 is_runai_obj_uri,
                                                 list_safetensors)

@@ -34,6 +36,23 @@ def test_runai_list_safetensors_local():
    assert len(safetensors) == len(files)


if __name__ == "__main__":
    test_is_runai_obj_uri()
    test_runai_list_safetensors_local()
def test_runai_pull_files_gcs(monkeypatch):
    monkeypatch.setenv("RUNAI_STREAMER_GCS_USE_ANONYMOUS_CREDENTIALS", "true")
    # Bypass default project lookup by setting GOOGLE_CLOUD_PROJECT
    monkeypatch.setenv("GOOGLE_CLOUD_PROJECT", "fake-project")
    filename = "LT08_L1GT_074061_20130309_20170505_01_T2_MTL.txt"
    gcs_bucket = "gs://gcp-public-data-landsat/LT08/01/074/061/LT08_L1GT_074061_20130309_20170505_01_T2/"
    gcs_url = f"{gcs_bucket}/{filename}"
    model = ObjectStorageModel(gcs_url)
    model.pull_files(gcs_bucket, allow_pattern=[f"*{filename}"])
    # To re-generate / change URLs:
    # gsutil ls -L gs://<gcs-url> | grep "Hash (md5)" | tr -d ' ' \
    #     | cut -d":" -f2 | base64 -d | xxd -p
    expected_checksum = "f60dea775da1392434275b311b31a431"
    hasher = hashlib.new("md5")
    with open(os.path.join(model.dir, filename), 'rb') as f:
        # Read the file in chunks to handle large files efficiently
        for chunk in iter(lambda: f.read(4096), b''):
            hasher.update(chunk)
    actual_checksum = hasher.hexdigest()
    assert actual_checksum == expected_checksum

@@ -214,7 +214,9 @@ VLM_TEST_SETTINGS = {
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[large_gpu_mark(min_gb=32)],
        # FIXME: Investigate mrope issue
        marks=[large_gpu_mark(min_gb=32),
               pytest.mark.skip(reason="Mrope issue")],
    ),
    #### Extended model tests
    "aria": VLMTestInfo(

@ -657,7 +657,8 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
}
|
||||
|
||||
_TRANSFORMERS_BACKEND_MODELS = {
|
||||
"TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"),
|
||||
"TransformersEmbeddingModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", min_transformers_version="4.57.0.dev0"), # noqa: E501
|
||||
"TransformersForSequenceClassification": _HfExamplesInfo("papluca/xlm-roberta-base-language-detection", min_transformers_version="4.57.0.dev0"), # noqa: E501
|
||||
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
|
||||
"TransformersForMultimodalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
|
||||
}
|
||||
|
||||
@ -9,9 +9,16 @@ from vllm.platforms import current_platform
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
from ..utils import multi_gpu_test, prep_prompts
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
from .utils import check_embeddings_close, check_logprobs_close
|
||||
|
||||
|
||||
def get_model(arch: str) -> str:
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(arch)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
return model_info.default
|
||||
|
||||
|
||||
def check_implementation(
|
||||
runner_ref: type[Union[HfRunner, VllmRunner]],
|
||||
runner_test: type[VllmRunner],
|
||||
@ -170,71 +177,47 @@ def test_embed_loading(vllm_runner, model):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
# Encoder model
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
])
|
||||
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
|
||||
import transformers
|
||||
from packaging.version import Version
|
||||
installed = Version(transformers.__version__)
|
||||
required = Version("4.57.0.dev0")
|
||||
if installed < required:
|
||||
pytest.skip("Encoder models with the Transformers backend require "
|
||||
f"transformers>={required}, but got {installed}")
|
||||
"arch",
|
||||
["TransformersEmbeddingModel", "TransformersForSequenceClassification"])
|
||||
def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
|
||||
model = get_model(arch)
|
||||
|
||||
with vllm_runner(model, max_model_len=512,
|
||||
model_impl="transformers") as vllm_model:
|
||||
vllm_kwargs = dict(
|
||||
max_model_len=None,
|
||||
model_impl="transformers",
|
||||
compilation_config=dict(cudagraph_capture_sizes=[8]),
|
||||
)
|
||||
|
||||
hf_kwargs = dict()
|
||||
if arch == "TransformersEmbeddingModel":
|
||||
hf_kwargs["is_sentence_transformer"] = True
|
||||
elif arch == "TransformersForSequenceClassification":
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
hf_kwargs["auto_cls"] = AutoModelForSequenceClassification
|
||||
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
|
||||
# This makes the input_ids different between hf_model and vllm_model.
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
with (vllm_runner(model, **vllm_kwargs) as
|
||||
vllm_model, hf_runner(model, **hf_kwargs) as hf_model):
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(model, is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
if arch == "TransformersEmbeddingModel":
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
elif arch == "TransformersForSequenceClassification":
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
def test_classify(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
model_impl="transformers") as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
|
||||
@@ -1,10 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch

from vllm.model_executor.models.utils import AutoWeightsLoader

pytestmark = pytest.mark.cpu_test


class ModuleWithBatchNorm(torch.nn.Module):

@ -16,6 +16,8 @@ from vllm.model_executor.models.vision import (
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import get_open_port, update_environment_variables
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("select_layers", "num_layers_loaded", "max_possible_layers",
|
||||
|
||||
@ -19,6 +19,8 @@ from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
|
||||
MultiModalSharedField)
|
||||
from vllm.multimodal.processing import PromptInsertion
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def _dummy_elem(
|
||||
modality: str,
|
||||
|
||||
@ -10,6 +10,8 @@ from PIL import Image, ImageDraw
|
||||
|
||||
from vllm.multimodal.hasher import MultiModalHasher
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent / "assets"
|
||||
assert ASSETS_DIR.exists()
|
||||
|
||||
|
||||
@ -8,6 +8,8 @@ from PIL import Image, ImageChops
|
||||
|
||||
from vllm.multimodal.image import ImageMediaIO, convert_image_mode
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent / "assets"
|
||||
assert ASSETS_DIR.exists()
|
||||
|
||||
|
||||
@ -1,10 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def assert_nested_tensors_equal(expected: NestedTensors,
|
||||
actual: NestedTensors):
|
||||
|
||||
@ -25,6 +25,8 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .utils import random_image
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -11,6 +11,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
from ..models.utils import build_model_context
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_id,limit_mm_per_prompt,expected",
|
||||
|
||||
@ -17,6 +17,8 @@ from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader,
|
||||
|
||||
from .utils import cosine_similarity, create_video_from_image, normalize_image
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
NUM_FRAMES = 10
|
||||
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
|
||||
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
|
||||
|
||||
@ -20,7 +20,6 @@ def test_pre_quantized_model(vllm_runner):
|
||||
output = llm.generate_greedy(["The capital of France is"],
|
||||
max_tokens=32)
|
||||
assert output
|
||||
print(output)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
|
||||
@ -42,7 +41,6 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
|
||||
max_tokens=32)
|
||||
|
||||
assert output
|
||||
print(output)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
|
||||
@ -57,7 +55,6 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
|
||||
max_tokens=32)
|
||||
|
||||
assert output
|
||||
print(output)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
|
||||
@ -72,7 +69,6 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
|
||||
max_tokens=32)
|
||||
|
||||
assert output
|
||||
print(output)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
|
||||
@ -92,7 +88,127 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
|
||||
max_tokens=32)
|
||||
|
||||
assert output
|
||||
print(output)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
|
||||
def test_on_the_fly_quant_config_dict_json(vllm_runner):
|
||||
"""Testing on the fly quantization, load_weights integration point,
|
||||
with config dict serialized to json string
|
||||
"""
|
||||
torch._dynamo.reset()
|
||||
model_name = "facebook/opt-125m"
|
||||
|
||||
import json
|
||||
|
||||
from torchao.core.config import config_to_dict
|
||||
from torchao.quantization import (
|
||||
Float8DynamicActivationFloat8WeightConfig, PerRow)
|
||||
|
||||
torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
|
||||
granularity=PerRow())
|
||||
hf_overrides = {
|
||||
"quantization_config_dict_json":
|
||||
json.dumps(config_to_dict(torchao_quant_config))
|
||||
}
|
||||
with vllm_runner(model_name=model_name,
|
||||
dtype="bfloat16",
|
||||
pt_load_map_location="cuda:0",
|
||||
quantization="torchao",
|
||||
hf_overrides=hf_overrides) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"],
|
||||
max_tokens=32)
|
||||
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
|
||||
def test_on_the_fly_quant_config_file(vllm_runner):
|
||||
"""Testing on the fly quantization, load_weights integration point,
|
||||
with config file
|
||||
"""
|
||||
torch._dynamo.reset()
|
||||
model_name = "facebook/opt-125m"
|
||||
import json
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
from torchao.core.config import config_to_dict
|
||||
from torchao.quantization import (
|
||||
Float8DynamicActivationFloat8WeightConfig, PerRow)
|
||||
|
||||
config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
|
||||
|
||||
with NamedTemporaryFile(mode="w", delete=False) as f:
|
||||
f.write(json.dumps(config_to_dict(config)))
|
||||
# close the file to save it
|
||||
f.close()
|
||||
config_file_name = str(f.name)
|
||||
|
||||
hf_overrides = {"quantization_config_file": config_file_name}
|
||||
with vllm_runner(model_name=model_name,
|
||||
dtype="bfloat16",
|
||||
pt_load_map_location="cuda:0",
|
||||
quantization="torchao",
|
||||
hf_overrides=hf_overrides) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"],
|
||||
max_tokens=32)
|
||||
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
|
||||
def test_reload_weights():
|
||||
import json
|
||||
|
||||
from torchao.core.config import config_to_dict
|
||||
from torchao.quantization import (
|
||||
Float8DynamicActivationFloat8WeightConfig, PerRow)
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
|
||||
granularity=PerRow())
|
||||
|
||||
hf_overrides = {
|
||||
"quantization_config_dict_json":
|
||||
json.dumps(config_to_dict(torchao_quant_config))
|
||||
}
|
||||
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen3-0.6B",
|
||||
dtype="bfloat16",
|
||||
load_format="dummy",
|
||||
enforce_eager=True,
|
||||
quantization="torchao",
|
||||
hf_overrides=hf_overrides,
|
||||
)
|
||||
# Update load format from `dummy` to `auto`
|
||||
llm.collective_rpc("update_config",
|
||||
args=({
|
||||
"load_config": {
|
||||
"load_format": "auto"
|
||||
}
|
||||
}, ))
|
||||
# Now reload real weights inplace
|
||||
llm.collective_rpc("reload_weights")
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# make sure it runs
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text
|
||||
# can also uncomment locally to make sure the generated
|
||||
# output makes sense
|
||||
# prompt = output.prompt
|
||||
# print(f"Prompt: {prompt!r}")
|
||||
# print(f"Output: {generated_text!r}")
|
||||
# print("-" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -6,6 +6,8 @@ import pytest
|
||||
from vllm.inputs import zip_enc_dec_prompts
|
||||
from vllm.inputs.parse import parse_and_batch_prompt
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
STRING_INPUTS = [
|
||||
'',
|
||||
'foo',
|
||||
|
||||
@ -1,8 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def test_request_output_forward_compatible():
|
||||
output = RequestOutput(request_id="test_request_id",
|
||||
|
||||
@@ -12,7 +12,7 @@ from .utils import ARGS, CONFIGS, ServerConfig


# for each server config, download the model and return the config
@pytest.fixture(scope="session", params=CONFIGS.keys())
@pytest.fixture(scope="package", params=CONFIGS.keys())
def server_config(request):
    config = CONFIGS[request.param]

@@ -26,7 +26,7 @@ def server_config(request):


# run this for each server config
@pytest.fixture(scope="session")
@pytest.fixture(scope="package")
def server(request, server_config: ServerConfig):
    model = server_config["model"]
    args_for_model = server_config["arguments"]
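For context, a package-scoped fixture is built once per test package rather than once per pytest session, so the server is no longer shared across the whole run. A minimal, self-contained illustration of the scoping (hypothetical fixture, not part of this diff):

import pytest

@pytest.fixture(scope="package")
def server_handle():
    # Set up once per test package and torn down after that package's last
    # test; with scope="session" the same object would live for the whole run.
    handle = {"started": True}
    yield handle
    handle["started"] = False
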
@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
|
||||
from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
pytest.skip("skip glm4_moe parser test", allow_module_level=True)
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "zai-org/GLM-4.5"
|
||||
|
||||
@ -15,6 +15,8 @@ from vllm.entrypoints.openai.tool_parsers import JambaToolParser
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
MODEL = "ai21labs/Jamba-tiny-dev"
|
||||
|
||||
|
||||
|
||||
@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
|
||||
from vllm.entrypoints.openai.tool_parsers import KimiK2ToolParser
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "moonshotai/Kimi-K2-Instruct"
|
||||
|
||||
|
||||
@ -12,6 +12,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionToolsParam,
|
||||
from vllm.entrypoints.openai.tool_parsers import MinimaxToolParser
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "MiniMaxAi/MiniMax-M1-40k"
|
||||
|
||||
|
||||
@ -18,6 +18,8 @@ from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import (
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
|
||||
|
||||
|
||||
|
||||
@ -16,6 +16,8 @@ from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "ByteDance-Seed/Seed-OSS-36B-Instruct"
|
||||
|
||||
|
||||
@ -12,6 +12,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
ChatCompletionToolsParam)
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
EXAMPLE_TOOLS = [
|
||||
{
|
||||
"type": "function",
|
||||
|
||||
@ -14,6 +14,8 @@ from vllm.entrypoints.openai.tool_parsers import xLAMToolParser
|
||||
from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "Salesforce/Llama-xLAM-2-8B-fc-r"
|
||||
|
||||
|
||||
@ -11,6 +11,8 @@ from vllm.v1.utils import ConstantList
|
||||
|
||||
from .utils import create_requests, create_scheduler
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def _make_model_runner_output(
|
||||
scheduler_output: SchedulerOutput, ) -> ModelRunnerOutput:
|
||||
|
||||
@ -1,9 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange
|
||||
from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
# ------------------ Mock Classes ------------------ #
|
||||
class MockRequest:
|
||||
|
||||
@ -32,6 +32,8 @@ from vllm.v1.request import Request
|
||||
|
||||
# yapf: enable
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _auto_init_hash_fn(request):
|
||||
|
||||
@ -25,6 +25,8 @@ from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId,
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, SlidingWindowSpec)
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _auto_init_hash_fn(request):
|
||||
@ -1267,7 +1269,7 @@ def test_kv_cache_events(blocks_to_cache: int):
|
||||
|
||||
|
||||
def test_eagle_enabled_removes_last_block():
|
||||
"""Verify Eagle does NOT remove blocks when request
|
||||
"""Verify Eagle does NOT remove blocks when request
|
||||
length is divisible by block size."""
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
|
||||
@ -23,6 +23,8 @@ from vllm.v1.structured_output.request import StructuredOutputRequest
|
||||
|
||||
from .utils import EOS_TOKEN_ID, create_requests, create_scheduler
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def test_add_requests():
|
||||
scheduler = create_scheduler()
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
@ -13,6 +14,8 @@ from vllm.v1.core.single_type_kv_cache_manager import (
|
||||
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
|
||||
SlidingWindowSpec)
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def get_sliding_window_manager(sliding_window_spec, block_pool):
|
||||
return SlidingWindowManager(sliding_window_spec,
|
||||
|
||||
tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py (new file, 341 additions)
@@ -0,0 +1,341 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Callable
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
from .utils import (create_model_runner_output, create_request,
|
||||
create_scheduler, create_vllm_config)
|
||||
|
||||
|
||||
def _make_get_num_new_matched_tokens(
|
||||
req_num_new_matched_tokens: dict[str, int],
|
||||
async_load,
|
||||
) -> Callable[[Request, int], tuple[int, bool]]:
|
||||
|
||||
def get_num_new_matched_tokens(request: Request,
|
||||
_: int) -> tuple[int, bool]:
|
||||
value = req_num_new_matched_tokens.get(request.request_id, 0)
|
||||
return value, async_load
|
||||
|
||||
return get_num_new_matched_tokens
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def scheduler():
|
||||
vllm_config = create_vllm_config()
|
||||
return create_scheduler(vllm_config)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_prompt_blocks,"
|
||||
"num_external_computed_blocks,"
|
||||
"invalid_block_idxs",
|
||||
[
|
||||
(100, 99, {0, 98}),
|
||||
(100, 99, {50, 98}),
|
||||
(100, 99, {98}),
|
||||
],
|
||||
)
|
||||
def test_async_load_failure(
|
||||
scheduler: Scheduler,
|
||||
num_prompt_blocks: int,
|
||||
num_external_computed_blocks: int,
|
||||
invalid_block_idxs: set[int],
|
||||
):
|
||||
assert num_prompt_blocks >= num_external_computed_blocks
|
||||
|
||||
num_prompt_tokens = num_prompt_blocks * scheduler.block_size
|
||||
num_external_computed_tokens = (num_external_computed_blocks *
|
||||
scheduler.block_size)
|
||||
|
||||
request1 = create_request(num_tokens=num_prompt_tokens)
|
||||
scheduler.add_request(request=request1)
|
||||
request2 = create_request(num_tokens=num_prompt_tokens)
|
||||
scheduler.add_request(request=request2)
|
||||
request3 = create_request(num_tokens=num_prompt_tokens)
|
||||
scheduler.add_request(request=request3)
|
||||
|
||||
# Mock KV connector method.
|
||||
# req_id -> num_external_computed_tokens
|
||||
req_num_new_matched_tokens = {
|
||||
request1.request_id: num_external_computed_tokens,
|
||||
request2.request_id: num_external_computed_tokens,
|
||||
request3.request_id: num_external_computed_tokens,
|
||||
}
|
||||
|
||||
scheduler.connector = Mock()
|
||||
scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||
_make_get_num_new_matched_tokens(req_num_new_matched_tokens,
|
||||
async_load=True))
|
||||
scheduler.connector.take_events.return_value = ()
|
||||
|
||||
scheduler_output = scheduler.schedule()
|
||||
|
||||
assert len(scheduler.waiting) == 3
|
||||
for request in scheduler.waiting:
|
||||
assert request.num_computed_tokens == 0
|
||||
assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
|
||||
|
||||
# Simulate a failure in loading some of request2 blocks.
|
||||
(req2_block_ids, ) = scheduler.kv_cache_manager.get_block_ids(
|
||||
request2.request_id)
|
||||
invalid_block_ids = {req2_block_ids[i] for i in invalid_block_idxs}
|
||||
model_runner_output = create_model_runner_output(
|
||||
reqs=[],
|
||||
finished_recving={request1.request_id, request3.request_id},
|
||||
invalid_block_ids=invalid_block_ids,
|
||||
use_eos=True)
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
|
||||
min_invalid_block_idx = min(invalid_block_idxs)
|
||||
|
||||
assert len(scheduler.waiting) == 3
|
||||
for request in scheduler.waiting:
|
||||
if request.request_id == request2.request_id:
|
||||
assert request.num_computed_tokens == (min_invalid_block_idx *
|
||||
scheduler.block_size)
|
||||
else:
|
||||
assert request.num_computed_tokens == 0
|
||||
assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||
assert scheduler.failed_recving_kv_req_ids == {request2.request_id}
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_prompt_blocks,"
|
||||
"num_external_computed_blocks,"
|
||||
"invalid_block_idxs",
|
||||
[
|
||||
(100, 99, {0, 98}),
|
||||
(100, 99, {50, 98}),
|
||||
(100, 99, {98}),
|
||||
],
|
||||
)
|
||||
def test_sync_load_failure(
|
||||
scheduler: Scheduler,
|
||||
num_prompt_blocks: int,
|
||||
num_external_computed_blocks: int,
|
||||
invalid_block_idxs: set[int],
|
||||
):
|
||||
assert num_prompt_blocks >= num_external_computed_blocks
|
||||
|
||||
num_prompt_tokens = num_prompt_blocks * scheduler.block_size
|
||||
num_external_computed_tokens = (num_external_computed_blocks *
|
||||
scheduler.block_size)
|
||||
|
||||
request1 = create_request(num_tokens=num_prompt_tokens)
|
||||
scheduler.add_request(request=request1)
|
||||
request2 = create_request(num_tokens=num_prompt_tokens)
|
||||
scheduler.add_request(request=request2)
|
||||
request3 = create_request(num_tokens=num_prompt_tokens)
|
||||
scheduler.add_request(request=request3)
|
||||
|
||||
# Mock KV connector method.
|
||||
# req_id -> num_external_computed_tokens
|
||||
req_num_new_matched_tokens = {
|
||||
request1.request_id: num_external_computed_tokens,
|
||||
request2.request_id: num_external_computed_tokens,
|
||||
request3.request_id: num_external_computed_tokens,
|
||||
}
|
||||
|
||||
scheduler.connector = Mock()
|
||||
scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||
_make_get_num_new_matched_tokens(req_num_new_matched_tokens,
|
||||
async_load=False))
|
||||
scheduler.connector.request_finished.return_value = (False, None)
|
||||
scheduler.connector.take_events.return_value = ()
|
||||
|
||||
scheduler_output = scheduler.schedule()
|
||||
|
||||
# req_id -> num_computed_tokens
|
||||
expected_computed_tokens = {
|
||||
request1.request_id: num_external_computed_tokens,
|
||||
request2.request_id: num_external_computed_tokens,
|
||||
request3.request_id: num_external_computed_tokens,
|
||||
}
|
||||
|
||||
assert len(scheduler.running) == 3
|
||||
assert len(scheduler_output.scheduled_new_reqs) == 3
|
||||
for request in scheduler_output.scheduled_new_reqs:
|
||||
assert request.num_computed_tokens == expected_computed_tokens[
|
||||
request.req_id]
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
|
||||
|
||||
# Simulate a failure in loading some of request2 blocks.
|
||||
req2_block_ids = scheduler_output.scheduled_new_reqs[1].block_ids[0]
|
||||
invalid_block_ids = {req2_block_ids[i] for i in invalid_block_idxs}
|
||||
model_runner_output = create_model_runner_output(
|
||||
[request1, request2, request3],
|
||||
invalid_block_ids=invalid_block_ids,
|
||||
use_eos=True)
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
|
||||
assert len(scheduler.running) == 1
|
||||
assert scheduler.running[0].request_id == request2.request_id
|
||||
assert scheduler.running[0].num_computed_tokens == (
|
||||
min(invalid_block_idxs) * scheduler.block_size)
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 3
|
||||
assert scheduler.connector.request_finished.call_count == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_prompt_blocks,"
|
||||
"num_external_computed_blocks,"
|
||||
"num_common_prefix_blocks,"
|
||||
"invalid_block_idxs",
|
||||
[
|
||||
(100, 99, 50, {0, 49}),
|
||||
(100, 99, 50, {25, 49}),
|
||||
(100, 99, 50, {49}),
|
||||
],
|
||||
)
|
||||
def test_sync_load_failure_with_shared_blocks(
|
||||
scheduler: Scheduler,
|
||||
num_prompt_blocks: int,
|
||||
num_external_computed_blocks: int,
|
||||
num_common_prefix_blocks: int,
|
||||
invalid_block_idxs: set[int],
|
||||
):
|
||||
assert (num_prompt_blocks >= num_external_computed_blocks >=
|
||||
num_common_prefix_blocks)
|
||||
|
||||
num_prompt_tokens = num_prompt_blocks * scheduler.block_size
|
||||
num_external_computed_tokens = (num_external_computed_blocks *
|
||||
scheduler.block_size)
|
||||
common_prefix_len = num_common_prefix_blocks * scheduler.block_size
|
||||
|
||||
request1 = create_request(num_tokens=num_prompt_tokens,
|
||||
common_prefix_len=common_prefix_len)
|
||||
scheduler.add_request(request=request1)
|
||||
request2 = create_request(num_tokens=num_prompt_tokens,
|
||||
common_prefix_len=common_prefix_len)
|
||||
scheduler.add_request(request=request2)
|
||||
|
||||
# Mock KV connector method.
|
||||
# req_id -> num_external_computed_tokens
|
||||
req_num_new_matched_tokens = {
|
||||
request1.request_id: num_external_computed_tokens,
|
||||
}
|
||||
|
||||
scheduler.connector = Mock()
|
||||
scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||
_make_get_num_new_matched_tokens(req_num_new_matched_tokens,
|
||||
async_load=False))
|
||||
scheduler.connector.take_events.return_value = ()
|
||||
|
||||
scheduler_output = scheduler.schedule()
|
||||
|
||||
# req_id -> num_computed_tokens
|
||||
expected_computed_tokens = {
|
||||
request1.request_id: num_external_computed_tokens,
|
||||
request2.request_id: common_prefix_len,
|
||||
}
|
||||
|
||||
assert len(scheduler.running) == 2
|
||||
assert len(scheduler_output.scheduled_new_reqs) == 2
|
||||
for request in scheduler_output.scheduled_new_reqs:
|
||||
assert request.num_computed_tokens == expected_computed_tokens[
|
||||
request.req_id]
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 2
|
||||
|
||||
# Simulate a failure in loading some of the shared blocks.
|
||||
req1_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
|
||||
invalid_block_ids = {req1_block_ids[i] for i in invalid_block_idxs}
|
||||
model_runner_output = create_model_runner_output(
|
||||
[request1, request2],
|
||||
invalid_block_ids=invalid_block_ids,
|
||||
use_eos=True)
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
|
||||
# req_id -> num_computed_tokens
|
||||
# all the common prefix blocks will be computed by request1
|
||||
expected_computed_tokens = {
|
||||
request1.request_id: min(invalid_block_idxs) * scheduler.block_size,
|
||||
request2.request_id: common_prefix_len,
|
||||
}
|
||||
|
||||
assert len(scheduler.running) == 2
|
||||
for request in scheduler.running:
|
||||
assert request.num_computed_tokens == expected_computed_tokens[
|
||||
request.request_id]
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_prompt_blocks,"
|
||||
"num_external_computed_blocks,"
|
||||
"invalid_block_idxs",
|
||||
[
|
||||
(100, 99, {0, 50, 98}),
|
||||
(100, 99, {98, 50, 0}),
|
||||
],
|
||||
)
|
||||
def test_async_progressive_load_failure(
|
||||
scheduler: Scheduler,
|
||||
num_prompt_blocks: int,
|
||||
num_external_computed_blocks: int,
|
||||
invalid_block_idxs: set[int],
|
||||
):
|
||||
assert num_prompt_blocks >= num_external_computed_blocks
|
||||
|
||||
num_prompt_tokens = num_prompt_blocks * scheduler.block_size
|
||||
num_external_computed_tokens = (num_external_computed_blocks *
|
||||
scheduler.block_size)
|
||||
|
||||
request = create_request(num_tokens=num_prompt_tokens)
|
||||
scheduler.add_request(request=request)
|
||||
|
||||
# Mock KV connector method.
|
||||
# req_id -> num_external_computed_tokens
|
||||
req_num_new_matched_tokens = {
|
||||
request.request_id: num_external_computed_tokens,
|
||||
}
|
||||
|
||||
scheduler.connector = Mock()
|
||||
scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||
_make_get_num_new_matched_tokens(req_num_new_matched_tokens,
|
||||
async_load=True))
|
||||
scheduler.connector.take_events.return_value = ()
|
||||
|
||||
scheduler_output = scheduler.schedule()
|
||||
|
||||
assert len(scheduler.waiting) == 1
|
||||
assert scheduler.waiting.peek_request().request_id == request.request_id
|
||||
assert request.num_computed_tokens == 0
|
||||
assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 1
|
||||
|
||||
min_invalid_block_idx = max(invalid_block_idxs) + 1
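# Note: initialized above the largest failed index so that the min() in the
# loop below ends up tracking the first (lowest) block index that failed.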
|
||||
# Simulate failures when progressively loading request blocks.
|
||||
for invalid_block_idx in invalid_block_idxs:
|
||||
(req_block_ids, ) = scheduler.kv_cache_manager.get_block_ids(
|
||||
request.request_id)
|
||||
invalid_block_ids = {req_block_ids[invalid_block_idx]}
|
||||
model_runner_output = create_model_runner_output(
|
||||
reqs=[],
|
||||
finished_recving=set(),
|
||||
invalid_block_ids=invalid_block_ids,
|
||||
use_eos=True)
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
|
||||
min_invalid_block_idx = min(min_invalid_block_idx, invalid_block_idx)
|
||||
|
||||
assert len(scheduler.waiting) == 1
|
||||
assert scheduler.waiting.peek_request(
|
||||
).request_id == request.request_id
|
||||
assert request.num_computed_tokens == (min_invalid_block_idx *
|
||||
scheduler.block_size)
|
||||
assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||
assert scheduler.failed_recving_kv_req_ids == {request.request_id}
|
||||
assert scheduler.connector.get_num_new_matched_tokens.call_count == 1
|
||||
@ -281,8 +281,8 @@ class RequestRunner:
|
||||
|
||||
model_runner_output = create_model_runner_output(
|
||||
reqs=self.scheduler.running,
|
||||
finished_sending=list(finished_sending),
|
||||
finished_recving=list(finished_recving),
|
||||
finished_sending=finished_sending,
|
||||
finished_recving=finished_recving,
|
||||
token_id=token_id)
|
||||
|
||||
if self.scheduler.running:
|
||||
|
||||
@@ -3,34 +3,38 @@
from concurrent.futures import Future
from typing import Optional

import pytest

from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput

pytestmark = pytest.mark.cpu_test


class DummyModelRunnerOutput(ModelRunnerOutput):

    def __init__(self,
                 finished_sending: Optional[set[str]] = None,
                 finished_recving: Optional[set[str]] = None):
                 finished_recving: Optional[set[str]] = None,
                 invalid_block_ids: Optional[set[int]] = None):
        self.kv_connector_output = KVConnectorOutput(
            finished_sending=finished_sending,
            finished_recving=finished_recving,
        )
            invalid_block_ids=invalid_block_ids or set())

    def __repr__(self):
        return (
            f"DummyModelRunnerOutput("
            f"finished_sending={self.kv_connector_output.finished_sending},"
            f"finished_recving={self.kv_connector_output.finished_recving})")
            f"finished_recving={self.kv_connector_output.finished_recving})"
            f"invalid_block_ids={self.kv_connector_output.invalid_block_ids})")

def test_aggregate_workers_output():
|
||||
aggregator = KVOutputAggregator(world_size=2)
|
||||
|
||||
output1 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving={'req2'})
|
||||
output2 = DummyModelRunnerOutput(finished_sending=None,
|
||||
finished_recving=None)
|
||||
output1 = DummyModelRunnerOutput()
|
||||
output2 = DummyModelRunnerOutput()
|
||||
|
||||
aggregated = aggregator.aggregate([output1, output2])
|
||||
|
||||
@ -38,11 +42,22 @@ def test_aggregate_workers_output():
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending is None
|
||||
assert aggregated.finished_recving is None
|
||||
assert not aggregated.invalid_block_ids
|
||||
|
||||
output1 = DummyModelRunnerOutput(finished_sending=None,
|
||||
finished_recving=None)
|
||||
output2 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving=None)
|
||||
output1 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving={'req2'})
|
||||
output2 = DummyModelRunnerOutput(invalid_block_ids={1})
|
||||
|
||||
aggregated = aggregator.aggregate([output1, output2])
|
||||
|
||||
assert aggregated is output1
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending is None
|
||||
assert aggregated.finished_recving is None
|
||||
assert aggregated.invalid_block_ids == {1}
|
||||
|
||||
output1 = DummyModelRunnerOutput(invalid_block_ids={2})
|
||||
output2 = DummyModelRunnerOutput(finished_sending={'req1'})
|
||||
|
||||
aggregated = aggregator.aggregate([output1, output2])
|
||||
|
||||
@ -50,11 +65,11 @@ def test_aggregate_workers_output():
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending == {'req1'}
|
||||
assert aggregated.finished_recving is None
|
||||
assert aggregated.invalid_block_ids == {2}
|
||||
|
||||
output1 = DummyModelRunnerOutput(finished_sending=None,
|
||||
finished_recving=None)
|
||||
output2 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving={'req2'})
|
||||
output1 = DummyModelRunnerOutput(invalid_block_ids={3, 4})
|
||||
output2 = DummyModelRunnerOutput(finished_recving={'req2'},
|
||||
invalid_block_ids={4, 5})
|
||||
|
||||
aggregated = aggregator.aggregate([output1, output2])
|
||||
|
||||
@ -62,6 +77,7 @@ def test_aggregate_workers_output():
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending is None
|
||||
assert aggregated.finished_recving == {'req2'}
|
||||
assert aggregated.invalid_block_ids == {3, 4, 5}
|
||||
|
||||
|
||||
def test_async_aggregate_workers_output():
|
||||
@ -71,10 +87,8 @@ def test_async_aggregate_workers_output():
|
||||
future2: Future[DummyModelRunnerOutput] = Future()
|
||||
result_future = aggregator.async_aggregate([future1, future2])
|
||||
|
||||
output1 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving={'req2'})
|
||||
output2 = DummyModelRunnerOutput(finished_sending=None,
|
||||
finished_recving=None)
|
||||
output1 = DummyModelRunnerOutput()
|
||||
output2 = DummyModelRunnerOutput()
|
||||
future1.set_result(output1)
|
||||
future2.set_result(output2)
|
||||
|
||||
@ -84,15 +98,32 @@ def test_async_aggregate_workers_output():
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending is None
|
||||
assert aggregated.finished_recving is None
|
||||
assert not aggregated.invalid_block_ids
|
||||
|
||||
future1 = Future()
|
||||
future2 = Future()
|
||||
result_future = aggregator.async_aggregate([future1, future2])
|
||||
|
||||
output1 = DummyModelRunnerOutput(finished_sending=None,
|
||||
finished_recving=None)
|
||||
output2 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving=None)
|
||||
output1 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving={'req2'})
|
||||
output2 = DummyModelRunnerOutput(invalid_block_ids={1})
|
||||
future1.set_result(output1)
|
||||
future2.set_result(output2)
|
||||
|
||||
assert result_future.done()
|
||||
aggregated = result_future.result()
|
||||
assert aggregated is output1
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending is None
|
||||
assert aggregated.finished_recving is None
|
||||
assert aggregated.invalid_block_ids == {1}
|
||||
|
||||
future1 = Future()
|
||||
future2 = Future()
|
||||
result_future = aggregator.async_aggregate([future1, future2])
|
||||
|
||||
output1 = DummyModelRunnerOutput(invalid_block_ids={2})
|
||||
output2 = DummyModelRunnerOutput(finished_sending={'req1'})
|
||||
future1.set_result(output1)
|
||||
future2.set_result(output2)
|
||||
|
||||
@ -102,15 +133,15 @@ def test_async_aggregate_workers_output():
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending == {'req1'}
|
||||
assert aggregated.finished_recving is None
|
||||
assert aggregated.invalid_block_ids == {2}
|
||||
|
||||
future1 = Future()
|
||||
future2 = Future()
|
||||
result_future = aggregator.async_aggregate([future1, future2])
|
||||
|
||||
output1 = DummyModelRunnerOutput(finished_sending=None,
|
||||
finished_recving=None)
|
||||
output2 = DummyModelRunnerOutput(finished_sending={'req1'},
|
||||
finished_recving={'req2'})
|
||||
output1 = DummyModelRunnerOutput(invalid_block_ids={3, 4})
|
||||
output2 = DummyModelRunnerOutput(finished_recving={'req2'},
|
||||
invalid_block_ids={4, 5})
|
||||
future1.set_result(output1)
|
||||
future2.set_result(output2)
|
||||
|
||||
@ -120,3 +151,4 @@ def test_async_aggregate_workers_output():
|
||||
aggregated = aggregated.kv_connector_output
|
||||
assert aggregated.finished_sending is None
|
||||
assert aggregated.finished_recving == {'req2'}
|
||||
assert aggregated.invalid_block_ids == {3, 4, 5}
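# As the assertions above suggest, invalid_block_ids appear to be unioned
# across workers (e.g. {3, 4} | {4, 5} == {3, 4, 5}), whereas a request is
# only reported in finished_sending/finished_recving once every worker has
# reported it.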
|
||||
|
||||
@ -2,12 +2,16 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput
|
||||
from vllm.v1.request import FinishReason, RequestStatus
|
||||
|
||||
from .utils import (assert_scheduler_empty, create_model_runner_output,
|
||||
create_request, create_scheduler, create_vllm_config)
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def test_basic_lifecycle():
|
||||
"""Test lifecycle of a Remote Decode request."""
|
||||
@ -88,7 +92,7 @@ def test_basic_lifecycle():
|
||||
# (3b): execute_model()
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=[request_id])
|
||||
finished_sending={request_id})
|
||||
|
||||
# (3c): update_from_output()
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
@ -135,7 +139,7 @@ def test_short_prompt_lifecycle():
|
||||
scheduler_output = scheduler.schedule()
|
||||
# Use create_model_runner_output to pass kv_connector_output along
|
||||
model_runner_output = create_model_runner_output(
|
||||
reqs=[request], finished_sending=[request.request_id])
|
||||
reqs=[request], finished_sending={request.request_id})
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
assert_scheduler_empty(scheduler)
|
||||
|
||||
@ -191,6 +195,6 @@ def test_prefix_cache_lifecycle():
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_sending=[request_remote.request_id])
|
||||
finished_sending={request_remote.request_id})
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
assert_scheduler_empty(scheduler)
|
||||
|
||||
@ -2,12 +2,16 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import copy
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput
|
||||
from vllm.v1.request import FinishReason, RequestStatus
|
||||
|
||||
from .utils import (assert_scheduler_empty, create_model_runner_output,
|
||||
create_request, create_scheduler, create_vllm_config)
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def test_basic_lifecycle():
|
||||
"""Test lifecycle of a remote prefill."""
|
||||
@ -74,7 +78,7 @@ def test_basic_lifecycle():
|
||||
# (2b): forward(): request finishes recv.
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_recving=[request_id])
|
||||
finished_recving={request_id})
|
||||
|
||||
# (2c): update_from_output():
|
||||
engine_core_outputs = scheduler.update_from_output(scheduler_output,
|
||||
@ -193,7 +197,7 @@ def test_interleaved_lifecycle():
|
||||
|
||||
model_runner_output = create_model_runner_output(
|
||||
[request_local_a, request_local_b],
|
||||
finished_recving=[request_remote.request_id])
|
||||
finished_recving={request_remote.request_id})
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
|
||||
# STEP 5: RECVed KVs are sent to ModelRunner.
|
||||
@ -242,16 +246,16 @@ def test_no_spurious_prefix_caching():
|
||||
request_id=1,
|
||||
block_size=BLOCK_SIZE,
|
||||
num_tokens=NUM_TOKENS,
|
||||
common_prefix_len=NUM_TOKENS,
|
||||
do_remote_prefill=True,
|
||||
use_all_1s_for_prompt_tokens=True,
|
||||
)
|
||||
|
||||
request_local = create_request(
|
||||
request_id=2,
|
||||
block_size=BLOCK_SIZE,
|
||||
num_tokens=NUM_TOKENS,
|
||||
common_prefix_len=NUM_TOKENS,
|
||||
do_remote_prefill=False,
|
||||
use_all_1s_for_prompt_tokens=True,
|
||||
)
|
||||
|
||||
# Schedule the remote prefill request. This should not
|
||||
@ -318,7 +322,7 @@ def test_full_block_prompt():
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||
model_runner_output.kv_connector_output = KVConnectorOutput(
|
||||
finished_recving=[request_id])
|
||||
finished_recving={request_id})
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
assert len(scheduler.waiting) == 1
|
||||
assert (request_id in scheduler.finished_recving_kv_req_ids)
|
||||
@ -398,7 +402,7 @@ def test_cannot_schedule_after_recv():
|
||||
# Step 3: finish recving (5 blocks in use)
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = create_model_runner_output(
|
||||
reqs=[request_normal], finished_recving=[request_remote.request_id])
|
||||
reqs=[request_normal], finished_recving={request_remote.request_id})
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
assert len(scheduler.running) == 1
|
||||
assert len(scheduler.waiting) == 1
|
||||
@ -512,7 +516,7 @@ def test_cannot_recv():
|
||||
# Step 5: finish recving (5 blocks in use)
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = create_model_runner_output(
|
||||
reqs=[], finished_recving=[request_remote.request_id])
|
||||
reqs=[], finished_recving={request_remote.request_id})
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
assert len(scheduler.running) == 0
|
||||
assert len(scheduler.waiting) == 1
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from itertools import count
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import torch
|
||||
@ -61,12 +62,15 @@ def create_vllm_config(
|
||||
max_num_seqs: int = 16,
|
||||
max_num_batched_tokens: int = 64,
|
||||
block_size: int = 16,
|
||||
max_model_len: int = 10000,
|
||||
enable_chunked_prefill: bool = True,
|
||||
) -> VllmConfig:
|
||||
"""Initialize VllmConfig For Testing."""
|
||||
scheduler_config = SchedulerConfig(
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_model_len=max_num_batched_tokens,
|
||||
max_model_len=max_model_len,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
)
|
||||
model_config = ModelConfig(
|
||||
model=model,
|
||||
@ -117,19 +121,27 @@ def create_scheduler(
|
||||
)
|
||||
|
||||
|
||||
_request_count = count(1)
|
||||
_none_hash_initialized = False
|
||||
|
||||
|
||||
def create_request(request_id: int,
|
||||
num_tokens: int = 10,
|
||||
max_tokens: int = 16,
|
||||
do_remote_decode: bool = False,
|
||||
do_remote_prefill: bool = False,
|
||||
use_all_1s_for_prompt_tokens: bool = False,
|
||||
num_remote_blocks: int = 3,
|
||||
block_size: int = 16,
|
||||
hash_fn: Callable = sha256) -> Request:
|
||||
def create_request(
|
||||
request_id: Optional[int] = None,
|
||||
num_tokens: int = 10,
|
||||
common_prefix_len=0,
|
||||
max_tokens: int = 16,
|
||||
do_remote_decode: bool = False,
|
||||
do_remote_prefill: bool = False,
|
||||
num_remote_blocks: int = 3,
|
||||
block_size: int = 16,
|
||||
hash_fn: Callable = sha256,
|
||||
) -> Request:
|
||||
"""Make dummy request for testing."""
|
||||
assert num_tokens >= common_prefix_len >= 0
|
||||
|
||||
if request_id is None:
|
||||
request_id = next(_request_count)
|
||||
|
||||
global _none_hash_initialized
|
||||
if not _none_hash_initialized:
|
||||
init_none_hash(hash_fn)
|
||||
@ -153,10 +165,9 @@ def create_request(request_id: int,
    max_tokens = 1 if do_remote_decode else max_tokens
    sampling_params = SamplingParams(max_tokens=max_tokens)

    if use_all_1s_for_prompt_tokens:
        prompt_token_ids = [1] * num_tokens
    else:
        prompt_token_ids = [i * request_id for i in range(num_tokens)]
    common_prefix = [1] * common_prefix_len if common_prefix_len > 0 else []
    suffix = [i * request_id for i in range(num_tokens - common_prefix_len)]
    prompt_token_ids = common_prefix + suffix

    req = Request(
        request_id=f"id-{request_id}",
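A worked example of the prompt construction above, showing how common_prefix_len yields requests that share a prefix of 1s; the values are chosen for illustration:

    request_id = 2
    num_tokens, common_prefix_len = 10, 4
    common_prefix = [1] * common_prefix_len if common_prefix_len > 0 else []
    suffix = [i * request_id for i in range(num_tokens - common_prefix_len)]
    prompt_token_ids = common_prefix + suffix
    assert prompt_token_ids == [1, 1, 1, 1, 0, 2, 4, 6, 8, 10]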
@ -173,8 +184,9 @@ def create_request(request_id: int,

def create_model_runner_output(
    reqs: list[Request],
    finished_sending: Optional[list[str]] = None,
    finished_recving: Optional[list[str]] = None,
    finished_sending: Optional[set[str]] = None,
    finished_recving: Optional[set[str]] = None,
    invalid_block_ids: Optional[set[int]] = None,
    use_eos: bool = False,
    token_id: int = 0,
) -> ModelRunnerOutput:
@ -189,10 +201,11 @@ def create_model_runner_output(
    sampled_token_ids = [[sampled_token] for _ in req_ids]

    kv_connector_output = None if (
        finished_sending is None
        and finished_recving is None) else KVConnectorOutput(
        finished_sending is None and finished_recving is None
        and invalid_block_ids is None) else KVConnectorOutput(
            finished_sending=finished_sending,
            finished_recving=finished_recving,
            invalid_block_ids=invalid_block_ids or set(),
        )

    # Make output data structure.
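A minimal restatement of the gating logic above, kept standalone with a plain dict standing in for KVConnectorOutput, so the types here are assumptions rather than the real class:

    def build_kv_connector_output(finished_sending=None, finished_recving=None,
                                  invalid_block_ids=None):
        # Stay None only when none of the three fields were reported.
        if (finished_sending is None and finished_recving is None
                and invalid_block_ids is None):
            return None
        return {"finished_sending": finished_sending,
                "finished_recving": finished_recving,
                "invalid_block_ids": invalid_block_ids or set()}

    assert build_kv_connector_output() is None
    assert build_kv_connector_output(invalid_block_ids={3})["invalid_block_ids"] == {3}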
@ -7,6 +7,8 @@ import pytest
from vllm.v1.metrics.reader import (Counter, Gauge, Histogram, Vector,
                                    get_metrics_snapshot)

pytestmark = pytest.mark.cpu_test


@pytest.fixture(autouse=True)
def test_registry(monkeypatch):
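For reference, a hedged sketch of how a module-level pytestmark interacts with marker selection; the file contents below are illustrative, not from this diff:

    # hypothetical test_example.py
    import pytest

    pytestmark = pytest.mark.cpu_test  # marks every test collected from this module

    def test_runs_without_a_gpu():
        assert 1 + 1 == 2

    # `pytest -m 'cpu_test'` then collects this module's tests,
    # while `pytest -m 'not cpu_test'` skips them.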
@ -337,13 +337,19 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
        "target_attn_1": mock.MagicMock(),
        "target_attn_2": mock.MagicMock()
    }
    target_indx_layers: dict[str, mock.MagicMock] = {}
    # Draft model has one extra attention layer compared to target model
    all_attn_layers = {
        **target_attn_layers, "draft_extra_attn": mock.MagicMock()
    }

    all_indx_layers: dict[str, mock.MagicMock] = {}

    # Make mock_get_layers return different values for each call
    mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]
    mock_get_layers.side_effect = [
        target_attn_layers, target_indx_layers, all_attn_layers,
        all_indx_layers
    ]

    # Setup mock for pp group to return the appropriate value for world size
    mock_pp_group = mock.MagicMock()
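A small sketch of the MagicMock side_effect behavior the updated test relies on; the toy dicts stand in for the real layer mappings:

    from unittest import mock

    get_layers = mock.MagicMock()
    # Each call consumes the next item, so four lookups get four distinct dicts:
    get_layers.side_effect = [{"attn": 1}, {}, {"attn": 1, "extra": 2}, {}]
    assert get_layers() == {"attn": 1}               # target attention layers
    assert get_layers() == {}                        # target indexer layers
    assert get_layers() == {"attn": 1, "extra": 2}   # target + draft attention layers
    assert get_layers() == {}                        # all indexer layers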
@ -658,6 +664,9 @@ def test_propose_tree(spec_token_tree):
    # Mock runner for attention metadata building.
    proposer.runner = mock.MagicMock()
    proposer.runner.attn_groups.append([mock.MagicMock()])
    proposer.runner.attn_groups[0][0].metadata_builders = [
        attn_metadata_builder
    ]
    proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
        attn_metadata_builder
    proposer._get_attention_metadata_builder = mock.MagicMock(
@ -63,7 +63,13 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers,

    target_attn_layers = {"target_attn_1": mock.MagicMock()}
    all_attn_layers = {**target_attn_layers, "draft_attn_1": mock.MagicMock()}
    mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]
    target_indexer_layers: dict = {}
    all_indexer_layers: dict = {}

    mock_get_layers.side_effect = [
        target_attn_layers, target_indexer_layers, all_attn_layers,
        all_indexer_layers
    ]

    mock_pp_group = mock.MagicMock()
    mock_pp_group.world_size = 1