Compare commits

2 Commits
khluu/test ... codex/remo

| Author | SHA1 | Date |
|---|---|---|
| | 85013bf094 | |
| | 07665f8679 | |
@ -86,6 +86,10 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||
fi
|
||||
|
||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||
fi
|
||||
|
||||
@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
&& python3 -m pip install --progress-bar off hf-transfer
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
|
||||
@ -62,7 +62,7 @@ echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
|
||||
&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
|
||||
&& python3 -m pip install --progress-bar off hf-transfer
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
|
||||
@ -6,28 +6,24 @@
|
||||
# to generate the final pipeline yaml file.
|
||||
|
||||
# Documentation
|
||||
# label(str): the name of the test. emojis allowed.
|
||||
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
|
||||
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
|
||||
# fast_check_only(bool): run this test on the fastcheck pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
|
||||
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
|
||||
# label(str): the name of the test. emoji allowed.
|
||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||
# command(str): the single command to run for tests. incompatible with commands.
|
||||
# commands(list): the list of commands to run for the test. incompatible with command.
|
||||
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
|
||||
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
|
||||
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
|
||||
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
|
||||
# in this case, commands must be specified. the first command runs on the first host, the second
|
||||
# commands(list): the list of commands to run for test. incompatbile with command.
|
||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||
# in this case, commands must be specified. the first command runs on first host, the second
|
||||
# command runs on the second host.
|
||||
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
|
||||
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
|
||||
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
|
||||
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
|
||||
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
|
||||
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
|
||||
|
||||
# When adding a test
|
||||
# - If the test belongs to an existing group, add it there
|
||||
# - If the test belong to an existing group, add it there
|
||||
# - If the test is short, add to any existing step
|
||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||
# Note that all steps execute in parallel.
|
||||
@ -114,7 +110,7 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Integration Test (API Server) # 100min
|
||||
timeout_in_minutes: 130
|
||||
@ -152,6 +148,7 @@ steps:
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/
|
||||
- vllm/core/
|
||||
- tests/distributed/test_utils
|
||||
- tests/distributed/test_pynccl
|
||||
- tests/distributed/test_events
|
||||
@ -165,18 +162,11 @@ steps:
|
||||
- tests/v1/test_hybrid_lb_dp.py
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
commands:
|
||||
# test with torchrun tp=2 and external_dp=2
|
||||
# test with tp=2 and external_dp=2
|
||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=2 and pp=2
|
||||
# test with tp=2 and pp=2
|
||||
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with torchrun tp=4 and dp=1
|
||||
- TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2, pp=2 and dp=1
|
||||
- PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=1 and dp=4 with ep
|
||||
- DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with torchrun tp=2 and dp=2 with ep
|
||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
@ -324,11 +314,12 @@ steps:
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_pooling.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||
|
||||
- label: Platform Tests (CUDA) # 4min
|
||||
timeout_in_minutes: 15
|
||||
@ -878,6 +869,8 @@ steps:
|
||||
- tests/distributed/
|
||||
- vllm/compilation
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/worker/worker.py
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/test_external_lb_dp.py
|
||||
@ -901,7 +894,7 @@ steps:
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
# this test fails consistently.
|
||||
# TODO: investigate and fix
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
|
||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||
- pytest -v -s models/multimodal/generation/test_maverick.py
|
||||
|
||||
|
||||
17 changes: .github/CODEOWNERS (vendored)
@ -4,8 +4,11 @@
|
||||
# This lists cover the "core" components of vLLM that require careful review
|
||||
/vllm/attention @LucasWilkinson
|
||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
|
||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/fused_moe @mgoin
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
@ -63,26 +66,18 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/models/test_transformers.py @hmellor
|
||||
|
||||
# Docs
|
||||
/docs/mkdocs @hmellor
|
||||
/docs/**/*.yml @hmellor
|
||||
/requirements/docs.txt @hmellor
|
||||
.readthedocs.yaml @hmellor
|
||||
/docs @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
|
||||
# Linting
|
||||
.markdownlint.yaml @hmellor
|
||||
.pre-commit-config.yaml @hmellor
|
||||
/tools/pre_commit @hmellor
|
||||
|
||||
# CPU
|
||||
/vllm/v1/worker/cpu* @bigPYJ1151
|
||||
/vllm/v1/worker/^cpu @bigPYJ1151
|
||||
/csrc/cpu @bigPYJ1151
|
||||
/vllm/platforms/cpu.py @bigPYJ1151
|
||||
/cmake/cpu_extension.cmake @bigPYJ1151
|
||||
/docker/Dockerfile.cpu @bigPYJ1151
|
||||
|
||||
# Intel GPU
|
||||
/vllm/v1/worker/xpu* @jikunshang
|
||||
/vllm/v1/worker/^xpu @jikunshang
|
||||
/vllm/platforms/xpu.py @jikunshang
|
||||
/docker/Dockerfile.xpu @jikunshang
|
||||
|
||||
|
||||
4 changes: .github/ISSUE_TEMPLATE/750-RFC.yml (vendored)
@ -43,6 +43,10 @@ body:
|
||||
Any other things you would like to mention.
|
||||
validations:
|
||||
required: false
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
|
||||
@ -49,7 +49,7 @@ repos:
|
||||
rev: 0.6.17
|
||||
hooks:
|
||||
- id: pip-compile
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- repo: local
|
||||
hooks:
|
||||
@ -60,32 +60,38 @@ repos:
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- id: mypy-local
|
||||
name: Run mypy for local Python installation
|
||||
entry: python tools/pre_commit/mypy.py 0 "local"
|
||||
entry: tools/mypy.sh 0 "local"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||
stages: [pre-commit] # Don't run in CI
|
||||
<<: &mypy_common
|
||||
language: python
|
||||
types_or: [python, pyi]
|
||||
require_serial: true
|
||||
additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
|
||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.9
|
||||
entry: python tools/pre_commit/mypy.py 1 "3.9"
|
||||
<<: *mypy_common
|
||||
entry: tools/mypy.sh 1 "3.9"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.10
|
||||
entry: python tools/pre_commit/mypy.py 1 "3.10"
|
||||
<<: *mypy_common
|
||||
entry: tools/mypy.sh 1 "3.10"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.11
|
||||
entry: python tools/pre_commit/mypy.py 1 "3.11"
|
||||
<<: *mypy_common
|
||||
entry: tools/mypy.sh 1 "3.11"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.12
|
||||
entry: python tools/pre_commit/mypy.py 1 "3.12"
|
||||
<<: *mypy_common
|
||||
entry: tools/mypy.sh 1 "3.12"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: *mypy_deps
|
||||
stages: [manual] # Only run in CI
|
||||
- id: shellcheck
|
||||
name: Lint shell scripts
|
||||
@ -149,10 +155,11 @@ repos:
|
||||
additional_dependencies: [regex]
|
||||
- id: check-pickle-imports
|
||||
name: Prevent new pickle/cloudpickle imports
|
||||
entry: python tools/pre_commit/check_pickle_imports.py
|
||||
entry: python tools/check_pickle_imports.py
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: [regex]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [pathspec, regex]
|
||||
- id: validate-config
|
||||
name: Validate configuration has default values and that each field has a docstring
|
||||
entry: python tools/validate_config.py
|
||||
|
||||
@ -11,13 +11,13 @@ from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import triton
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_w8a8_block_fp8_matmul,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
@ -1,38 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <algorithm>
|
||||
|
||||
// maximum blocks per SM cap
|
||||
#ifndef VLLM_LAUNCH_BLOCKS_CAP
|
||||
#define VLLM_LAUNCH_BLOCKS_CAP 4
|
||||
#endif
|
||||
|
||||
// compile-time estimate of max threads per SM for launch bounds.
|
||||
#ifndef VLLM_MAX_THREADS_PER_SM
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
#define VLLM_MAX_THREADS_PER_SM 1536
|
||||
#else
|
||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// compute the number of blocks per SM to request in __launch_bounds__
|
||||
#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
|
||||
#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
|
||||
(((VAL) <= 0) \
|
||||
? 1 \
|
||||
: (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
|
||||
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
|
||||
VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
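// Example: VLLM_BLOCKS_PER_SM(1024) -> 2048 / 1024 = 2 blocks per SM,
// which is below the VLLM_LAUNCH_BLOCKS_CAP of 4 and is therefore kept.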
|
||||
|
||||
// runtime-time helper to compute blocks/SM
|
||||
static inline int vllm_runtime_blocks_per_sm(int block_threads) {
|
||||
int device = -1;
|
||||
cudaGetDevice(&device);
|
||||
int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM;
|
||||
cudaDeviceGetAttribute(&max_threads_per_sm,
|
||||
cudaDevAttrMaxThreadsPerMultiProcessor, device);
|
||||
int blocks = (block_threads > 0) ? (max_threads_per_sm / block_threads) : 1;
|
||||
return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
|
||||
}
|
||||
@ -26,7 +26,6 @@
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "launch_bounds_utils.h"
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
@ -64,7 +63,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
__global__ void __launch_bounds__(1024, 4)
|
||||
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out,
|
||||
uint32_t* SFout) {
|
||||
@ -132,8 +131,7 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
int const numBlocksPerSM = 2048 / block.x;
|
||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(
|
||||
|
||||
@ -26,13 +26,12 @@
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "nvfp4_utils.cuh"
|
||||
#include "launch_bounds_utils.h"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
__global__ void __launch_bounds__(512, 4)
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||
uint32_t* input_offset_by_experts,
|
||||
@ -130,7 +129,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
|
||||
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
|
||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
__global__ void __launch_bounds__(1024, 4)
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||
uint32_t* input_offset_by_experts,
|
||||
@ -234,9 +233,8 @@ void quant_impl(void* output, void* output_scale, void* input,
|
||||
int const workSizePerRow = k / ELTS_PER_THREAD;
|
||||
int const totalWorkSize = m_topk * workSizePerRow;
|
||||
dim3 block(std::min(workSizePerRow, 512));
|
||||
// Get number of blocks per SM
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
// Get number of blocks per SM (assume we can fully utilize the SM).
|
||||
int const numBlocksPerSM = 2048 / block.x;
|
||||
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
|
||||
multiProcessorCount * numBlocksPerSM));
|
||||
while (grid.x <= multiProcessorCount && block.x > 64) {
|
||||
|
||||
@ -26,14 +26,13 @@
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "launch_bounds_utils.h"
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
__global__ void __launch_bounds__(512, 4)
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
@ -76,9 +75,8 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
|
||||
// Grid, Block size.
|
||||
// Each thread converts 8 values.
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
||||
// Get number of blocks per SM
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
// Get number of blocks per SM (assume we can fully utilize the SM).
|
||||
int const numBlocksPerSM = 2048 / block.x;
|
||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
// Launch the cvt kernel.
|
||||
|
||||
@ -36,23 +36,22 @@ th:not(:first-child) {
|
||||
}
|
||||
</style>
|
||||
|
||||
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search | [prompt-embeds](prompt_embeds.md) |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
|
||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | |
|
||||
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | |
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | |
|
||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | |
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ |
|
||||
| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | |
|
||||
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | |
|
||||
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | |
|
||||
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | |
|
||||
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
|
||||
| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | |
|
||||
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
|
||||
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
|
||||
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
|
||||
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
|
||||
| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
|
||||
| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)<sup>^</sup> | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
|
||||
| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | |
|
||||
| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ |
|
||||
|
||||
\* Chunked prefill and prefix caching are only applicable to last-token pooling.
|
||||
<sup>^</sup> LoRA is only applicable to the language backbone of multimodal models.
|
||||
@ -77,4 +76,3 @@ th:not(:first-child) {
|
||||
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ |
|
||||
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) |
|
||||
|
||||
@ -31,12 +31,6 @@ Now supports 5 types of connectors:
|
||||
--kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
|
||||
```
|
||||
|
||||
- **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker):
|
||||
|
||||
```bash
|
||||
--kv-transfer-config '{"kv_connector":"OffloadingConnector","kv_role":"kv_both","kv_connector_extra_config":{"block_size": 64, "num_cpu_blocks": 1000}}'
|
||||
```
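
As an offline-inference counterpart (a sketch, not part of the diff), the same connector settings can also be passed programmatically; the values simply mirror the CLI example above, and the model name is only a placeholder:

```python
# Hedged sketch: configure the OffloadingConnector from the Python API,
# reusing the block_size / num_cpu_blocks values from the CLI example above.
from vllm import LLM
from vllm.config import KVTransferConfig

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    kv_transfer_config=KVTransferConfig(
        kv_connector="OffloadingConnector",
        kv_role="kv_both",
        kv_connector_extra_config={"block_size": 64, "num_cpu_blocks": 1000},
    ),
)
```
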
## Benchmarks
|
||||
|
||||
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
|
||||
|
||||
@ -6,6 +6,9 @@ This page teaches you how to pass prompt embedding inputs to vLLM.

The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.

!!! note
    Prompt embeddings are currently only supported in the v0 engine.

## Offline Inference

To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]:
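
As a rough illustration only (this sketch is not part of the diff), passing precomputed embeddings offline could look like the following. It assumes `EmbedsPrompt` is a dict carrying a `prompt_embeds` tensor and reuses the meta-llama/Llama-3.1-8B-Instruct example from this page; depending on the vLLM version, an `enable_prompt_embeds` engine flag may also be required.

```python
# Hedged sketch: compute prompt embeddings with Transformers, then hand them to vLLM.
import transformers

from vllm import LLM

model_name = "meta-llama/Llama-3.1-8B-Instruct"  # example model from this page

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

# Look up embeddings for the token ids, mirroring what the model does internally.
token_ids = tokenizer("What is the capital of France?", return_tensors="pt").input_ids
prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

llm = LLM(model=model_name)
outputs = llm.generate({"prompt_embeds": prompt_embeds})
print(outputs[0].outputs[0].text)
```
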
@ -59,7 +59,7 @@ enabling the corresponding APIs:

#### Predefined models

If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
you can override some of its attributes via the `--pooler-config` option.
you can override some of its attributes via the `--override-pooler-config` option.

#### Converted models

@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default:

When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.

You can further customize this via the `--pooler-config` option,
You can further customize this via the `--override-pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults.
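
As a rough sketch (not part of the diff), the same override can be applied from the Python API. The engine keyword has changed across vLLM versions (`override_pooler_config` in older releases, `pooler_config` after the rename this page reflects), so treat the argument name, and the example model, as assumptions to check against the version you run.

```python
# Hedged sketch: override the pooler when loading an embedding model offline.
from vllm import LLM
from vllm.config import PoolerConfig

llm = LLM(
    model="ssmits/Qwen2-7B-Instruct-embed-base",  # model mentioned later on this page
    task="embed",
    # Newer versions may expose this as `pooler_config` instead.
    override_pooler_config=PoolerConfig(pooling_type="MEAN"),
)

(output,) = llm.embed("The capital of France is Paris.")
print(len(output.outputs.embedding))
```
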
## Offline Inference
|
||||
|
||||
@ -17,24 +17,9 @@ These models are what we list in [supported-text-models][supported-text-models]
|
||||
|
||||
### Transformers
|
||||
|
||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
|
||||
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases.
|
||||
|
||||
Currently, the Transformers backend works for the following:
|
||||
|
||||
- Modalities: embedding models, language models and vision-language models*
|
||||
- Architectures: encoder-only, decoder-only
|
||||
- Attention types: full attention and/or sliding attention
|
||||
|
||||
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
|
||||
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
|
||||
|
||||
- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
|
||||
- Any combination of the following vLLM parallelisation schemes:
|
||||
- Pipeline parallel
|
||||
- Tensor parallel
|
||||
|
||||
Checking if the modeling backend is Transformers is as simple as:
|
||||
To check if the modeling backend is Transformers, you can simply do this:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
@ -42,12 +27,16 @@ llm = LLM(model=...) # Name or path of your model
|
||||
llm.apply_model(lambda model: print(type(model)))
|
||||
```
|
||||
|
||||
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
|
||||
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!
|
||||
|
||||
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
|
||||
!!! tip
|
||||
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
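
As a quick illustration (not part of the diff), forcing the backend from the offline API might look like this; the checkpoint name is only a placeholder:

```python
# Hedged sketch: ask vLLM to use the Transformers model implementation
# even when a native vLLM implementation of the architecture exists.
from vllm import LLM

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", model_impl="transformers")

# Should print a Transformers* class, per the check shown earlier on this page.
llm.apply_model(lambda model: print(type(model)))
```
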
!!! note
|
||||
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
|
||||
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
|
||||
|
||||
!!! note
|
||||
In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
|
||||
|
||||
#### Custom models
|
||||
|
||||
@ -77,11 +66,10 @@ This section details the necessary modifications to make to a Transformers compa
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`.
|
||||
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
||||
3. `MyModel` must contain `_supports_attention_backend = True`.
|
||||
|
||||
<details class="code">
|
||||
<details>
|
||||
<summary>modeling_my_model.py</summary>
|
||||
|
||||
```python
|
||||
@ -90,7 +78,6 @@ from transformers import PreTrainedModel
|
||||
from torch import nn
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
is_causal = False # Only do this for encoder-only models
|
||||
|
||||
def forward(self, hidden_states, **kwargs):
|
||||
...
|
||||
@ -114,13 +101,13 @@ Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
|
||||
|
||||
<details class="code">
|
||||
<details>
|
||||
<summary>configuration_my_model.py</summary>
|
||||
|
||||
```python
|
||||
@ -352,7 +339,6 @@ th {
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DotsOCRForCausalLM` | dots_ocr | `rednote-hilab/dots.ocr` | | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
|
||||
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
@ -471,7 +457,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
|
||||
|
||||
!!! note
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
|
||||
!!! note
|
||||
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
|
||||
@ -566,7 +552,7 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
!!! important
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
|
||||
#### Token Classification
|
||||
|
||||
|
||||
510 changes: examples/offline_inference/profiling.py (new file)
@ -0,0 +1,510 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from argparse import RawTextHelpFormatter
|
||||
from collections.abc import Generator
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Optional, TypeAlias
|
||||
|
||||
import torch
|
||||
import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.profiler.layerwise_profile import layerwise_profile
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
BATCH_SIZE_DEFAULT = 1
|
||||
PROMPT_LEN_DEFAULT = 256
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProfileContext:
|
||||
engine_args: EngineArgs
|
||||
prompt_len: int
|
||||
batch_size: int
|
||||
|
||||
# The profiler can run in 2 modes,
|
||||
# 1. Run profiler for user specified num_steps
|
||||
num_steps: Optional[int] = None
|
||||
# 2. Run profiler until all requests complete
|
||||
complete_num_requests_per_step: Optional[int] = None
|
||||
|
||||
save_chrome_traces_folder: Optional[str] = None
|
||||
|
||||
|
||||
def get_dtype(dtype: str):
|
||||
if dtype == "torch.float":
|
||||
return torch.float
|
||||
else:
|
||||
return dtype
|
||||
|
||||
|
||||
OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
|
||||
|
||||
|
||||
def compute_request_output_lengths(
|
||||
batch_size: int, step_requests: list[int]
|
||||
) -> OutputLen_NumReqs_Map:
|
||||
"""
|
||||
Given the number of requests, batch_size, and the number of requests
|
||||
that each engine-step should process, step_requests, determine the
|
||||
output lengths of the requests such that step_request is honoured.
|
||||
|
||||
Example:
|
||||
if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
|
||||
then return,
|
||||
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
|
||||
32 requests should have output length 2,
|
||||
32 requests should have output length 3,
|
||||
32 requests should have output length 4,
|
||||
31 requests should have output length 5,
|
||||
1 request should have output length 6.
|
||||
|
||||
Args:
|
||||
batch_size (int): Number of requests submitted for profile. This is
|
||||
args.batch_size.
|
||||
step_requests (list[int]): step_requests[i] is the number of requests
|
||||
that the ith engine step should process.
|
||||
|
||||
Returns:
|
||||
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
|
||||
number of requests required to have that output-length as values.
|
||||
"""
|
||||
ol_nr: OutputLen_NumReqs_Map = {}
|
||||
|
||||
# Number of request that are assigned an output-length
|
||||
num_reqs_assigned: int = 0
|
||||
num_steps: int = len(step_requests)
|
||||
|
||||
# sanity check. The first step (prefill-step), must process all requests.
|
||||
assert step_requests[0] == batch_size
|
||||
|
||||
# Begin assignments from the last step.
|
||||
output_length: int = num_steps
|
||||
for num_requests_at_step in reversed(step_requests):
|
||||
if num_reqs_assigned == batch_size:
|
||||
break
|
||||
|
||||
assert num_reqs_assigned < batch_size
|
||||
|
||||
# Remove the number of requests that have been determined
|
||||
# to participate in this step and beyond.
|
||||
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
|
||||
assert num_reqs_unassigned_at_step >= 0
|
||||
|
||||
if num_reqs_unassigned_at_step > 0:
|
||||
ol_nr[output_length] = num_reqs_unassigned_at_step
|
||||
num_reqs_assigned += num_reqs_unassigned_at_step
|
||||
|
||||
output_length -= 1
|
||||
|
||||
# sanity checks.
|
||||
assert sum(ol_nr.values()) == batch_size, (
|
||||
"Number of requests in output-length assignment does not match "
|
||||
f"batch-size.\n batch size {batch_size} - "
|
||||
f"step requests {step_requests} - assignments {ol_nr}"
|
||||
)
|
||||
|
||||
# Check that the output-length is in [1, num-steps]. Output length must be
|
||||
# at least 1 as all requests must participate in the prefill-step.
|
||||
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
|
||||
"Output lengths of requests should be in range "
|
||||
f"[1, num-engine-steps].\n batch size {batch_size} - "
|
||||
f"step requests {step_requests} - assignments {ol_nr}"
|
||||
)
|
||||
|
||||
return ol_nr
|
||||
|
||||
|
||||
def determine_requests_per_step(context: ProfileContext) -> list[int]:
|
||||
"""
|
||||
Determine number of requests each engine step should process.
|
||||
If context.num_steps is set, then all engine steps process the
|
||||
same number of requests and the output list is of length
|
||||
context.num_steps.
|
||||
|
||||
If context.complete_num_requests_per_step is set, then each decode step
|
||||
processes fewer and fewer requests until there are no requests to process.
|
||||
In this case, the output list is as big as the number of steps
|
||||
required to process all requests.
|
||||
|
||||
Args:
|
||||
context: ProfileContext object.
|
||||
|
||||
Returns:
|
||||
list[int]: Number of requests to process for all engine-steps.
|
||||
output[i], contains the number of requests that the ith step
|
||||
should process.
|
||||
"""
|
||||
if context.num_steps:
|
||||
# All requests must run until num_engine_steps. This implies
|
||||
# that their output lengths must be equal to num_engine_steps.
|
||||
return [context.batch_size] * context.num_steps
|
||||
|
||||
assert (
|
||||
context.complete_num_requests_per_step
|
||||
and context.complete_num_requests_per_step > 0
|
||||
), (
|
||||
f"Expected a positive complete_num_requests_per_step argument."
|
||||
f"Instead got {context.complete_num_requests_per_step}"
|
||||
)
|
||||
|
||||
# We start dropping after the first decode step.
|
||||
step_requests = [
|
||||
context.batch_size, # prefill
|
||||
context.batch_size, # decode
|
||||
]
|
||||
|
||||
num_running_requests = context.batch_size
|
||||
num_running_requests -= context.complete_num_requests_per_step
|
||||
while num_running_requests > 0:
|
||||
step_requests.append(num_running_requests)
|
||||
num_running_requests -= context.complete_num_requests_per_step
|
||||
|
||||
if step_requests[-1] != 1:
|
||||
# have 1 request running at the last step. This is often
|
||||
# useful
|
||||
step_requests.append(1)
|
||||
|
||||
return step_requests
|
||||
|
||||
|
||||
def run_profile(
|
||||
context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
|
||||
):
|
||||
print("Run profile with:")
|
||||
for key, value in asdict(context).items():
|
||||
print(f" {key} = {value}")
|
||||
|
||||
requests_per_step: list[int] = determine_requests_per_step(context)
|
||||
|
||||
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
|
||||
context.batch_size, requests_per_step
|
||||
)
|
||||
|
||||
num_steps_to_profile: int = len(requests_per_step)
|
||||
max_output_len: int = max(ol_nr.keys())
|
||||
assert max_output_len >= 1
|
||||
|
||||
# Create sampling params
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
# max_tokens is set on a per-request basis.
|
||||
max_tokens=None,
|
||||
ignore_eos=True,
|
||||
)
|
||||
|
||||
# Create LLM
|
||||
llm = LLM(**asdict(context.engine_args))
|
||||
batch_size = context.batch_size
|
||||
prompt_len = context.prompt_len
|
||||
|
||||
scheduler_config = llm.llm_engine.vllm_config.scheduler_config
|
||||
max_model_len = llm.llm_engine.model_config.max_model_len
|
||||
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
|
||||
max_num_seqs = scheduler_config.max_num_seqs
|
||||
|
||||
if batch_size * prompt_len > max_num_batched_tokens:
|
||||
print(
|
||||
f"ERROR: chosen batch_size * prompt_len "
|
||||
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
|
||||
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
|
||||
f"and therefore cannot be run in a single profile step, please "
|
||||
f"choose a smaller batch size or prompt length, or increase "
|
||||
f"--max-num-batched-tokens"
|
||||
)
|
||||
sys.exit(-1)
|
||||
if batch_size > max_num_seqs:
|
||||
print(
|
||||
f"ERROR: chosen batch_size ({batch_size}) is larger than "
|
||||
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
|
||||
f"single profile step, please choose a smaller batch size"
|
||||
)
|
||||
sys.exit(-1)
|
||||
print(
|
||||
"llm.llm_engine.model_config.max_model_len: ",
|
||||
llm.llm_engine.model_config.max_model_len,
|
||||
)
|
||||
if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
|
||||
print(
|
||||
f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
|
||||
f"{max_output_len} = {prompt_len + max_output_len}) is larger "
|
||||
f"than the model's max_model_len ({max_model_len}), please "
|
||||
f"choose a smaller prompt_len or max_output_len, or increase "
|
||||
f"--max-model-len"
|
||||
)
|
||||
sys.exit(-1)
|
||||
|
||||
def add_requests():
|
||||
def get_output_len_generator() -> Generator[int, Any, Any]:
|
||||
for output_len, num_reqs in ol_nr.items():
|
||||
for _ in range(num_reqs):
|
||||
yield output_len
|
||||
|
||||
output_len_generator = get_output_len_generator()
|
||||
for i in range(batch_size):
|
||||
sampling_params.max_tokens = next(output_len_generator)
|
||||
assert isinstance(sampling_params.max_tokens, int)
|
||||
|
||||
prompt_token_ids = torch.randint(
|
||||
llm.get_tokenizer().vocab_size, size=(prompt_len,)
|
||||
).tolist()
|
||||
|
||||
llm.llm_engine.add_request(
|
||||
request_id=f"seq{i}",
|
||||
prompt={"prompt_token_ids": prompt_token_ids},
|
||||
params=sampling_params,
|
||||
)
|
||||
|
||||
def abort_requests():
|
||||
for i in range(batch_size):
|
||||
llm.llm_engine.abort_request(f"seq{i}")
|
||||
|
||||
# Warm up run
|
||||
print("Warm up run ...")
|
||||
add_requests()
|
||||
llm.llm_engine.step() # Prefill
|
||||
llm.llm_engine.step() # Decode
|
||||
abort_requests()
|
||||
|
||||
print("Profile run ...")
|
||||
add_requests()
|
||||
|
||||
with layerwise_profile() as prefill_prof:
|
||||
llm.llm_engine.step() # First step is prefill
|
||||
|
||||
decode_profs = []
|
||||
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
|
||||
num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
|
||||
with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
|
||||
llm.llm_engine.step()
|
||||
decode_profs.append(decode_prof)
|
||||
|
||||
decode_results_list = [prof.results for prof in decode_profs]
|
||||
prefill_results = prefill_prof.results
|
||||
has_decode = len(decode_results_list) > 0
|
||||
|
||||
LINE_WIDTH = 80
|
||||
print("=" * LINE_WIDTH)
|
||||
print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
prefill_results.print_model_table()
|
||||
|
||||
if has_decode:
|
||||
print()
|
||||
print("=" * LINE_WIDTH)
|
||||
print(
|
||||
f"= First Decode Step Model Table "
|
||||
f"(prompt_len={prompt_len}, batch_size={batch_size})"
|
||||
)
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
decode_results_list[0].print_model_table()
|
||||
|
||||
print()
|
||||
print("=" * LINE_WIDTH)
|
||||
print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
prefill_results.print_summary_table()
|
||||
|
||||
if has_decode:
|
||||
print()
|
||||
print("=" * LINE_WIDTH)
|
||||
print(
|
||||
f"= First Decode Step Summary Table "
|
||||
f"(prompt_len={prompt_len}, batch_size={batch_size})"
|
||||
)
|
||||
print("=" * LINE_WIDTH)
|
||||
print()
|
||||
decode_results_list[0].print_summary_table()
|
||||
|
||||
if csv_output:
|
||||
csv_filename_base = (
|
||||
csv_output[:-4] if csv_output.endswith(".csv") else csv_output
|
||||
)
|
||||
prefill_results.export_model_stats_table_csv(
|
||||
csv_filename_base + "_prefill_model_table.csv"
|
||||
)
|
||||
prefill_results.export_summary_stats_table_csv(
|
||||
csv_filename_base + "_prefill_summary_table.csv"
|
||||
)
|
||||
|
||||
if has_decode:
|
||||
decode_results_list[0].export_model_stats_table_csv(
|
||||
csv_filename_base + "_decode_model_table.csv"
|
||||
)
|
||||
decode_results_list[0].export_summary_stats_table_csv(
|
||||
csv_filename_base + "_decode_summary_table.csv"
|
||||
)
|
||||
|
||||
if json_output:
|
||||
cuda_devices = [
|
||||
torch.cuda.get_device_properties(dev_idx)
|
||||
for dev_idx in range(torch.cuda.device_count())
|
||||
]
|
||||
|
||||
json_dict = {
|
||||
"context": {
|
||||
"python_version": f"{sys.version}",
|
||||
"torch_version": f"{torch.__version__}",
|
||||
"torch_cuda_version": f"{torch.version.cuda}",
|
||||
"cuda_devices": f"{cuda_devices}",
|
||||
**asdict(context),
|
||||
},
|
||||
"prefill": prefill_results.convert_stats_to_dict(),
|
||||
}
|
||||
|
||||
if has_decode:
|
||||
for idx, dr in enumerate(decode_results_list):
|
||||
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
|
||||
|
||||
# Add .json to json_output filename if it doesn't exist already.
|
||||
json_output_file = (
|
||||
json_output if json_output.endswith(".json") else json_output + ".json"
|
||||
)
|
||||
with open(json_output_file, "w+") as f:
|
||||
json.dump(json_dict, f, indent=2)
|
||||
pass
|
||||
|
||||
if context.save_chrome_traces_folder is not None:
|
||||
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
|
||||
prefill_prof.profiler.export_chrome_trace(
|
||||
context.save_chrome_traces_folder + "/prefill.json"
|
||||
)
|
||||
for idx, decode_prof in enumerate(decode_profs):
|
||||
decode_prof.profiler.export_chrome_trace(
|
||||
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
|
||||
)
|
||||
print(
|
||||
"Traces saved as prefill.json and decode_1.json, etc."
|
||||
f" in folder {context.save_chrome_traces_folder}"
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="""
|
||||
Profile a model
|
||||
|
||||
example:
|
||||
```
|
||||
python examples/offline_inference/profiling.py \\
|
||||
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
|
||||
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
|
||||
--enforce-eager run_num_steps -n 2
|
||||
```
|
||||
|
||||
then you can use various tools to analyze the json output
|
||||
terminal ascii tables:
|
||||
```
|
||||
python tools/profiler/print_layerwise_table.py \\
|
||||
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
|
||||
```
|
||||
or create matplotlib stacked bar charts:
|
||||
```
|
||||
python tools/profiler/visualize_layerwise_profile.py \\
|
||||
--json-trace Llama31-8b-FP8.json \\
|
||||
--output-directory profile_breakdown --plot-metric pct_cuda_time
|
||||
```
|
||||
""",
|
||||
formatter_class=RawTextHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--csv",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Export the results as multiple csv file. This should be the root "
|
||||
"filename, will create <filename>_prefill_model_table.csv, "
|
||||
"<filename>_prefill_summary_table.csv, "
|
||||
"<filename>_decode_model_table.csv, and "
|
||||
"<filename>_decode_summary_table.csv",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Export the results as a json file. This should be the filename",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-chrome-traces-folder",
|
||||
type=str,
|
||||
help="Save chrome traces for the prefill and decode "
|
||||
"will save traces as prefill.json and decode_1.json, "
|
||||
"etc. inside this folder",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prompt-len",
|
||||
type=int,
|
||||
default=PROMPT_LEN_DEFAULT,
|
||||
help=f"Length of the random prompt to use when profiling, all batched "
|
||||
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=BATCH_SIZE_DEFAULT,
|
||||
help=f"Number of requests to run as a single batch, "
|
||||
f"default={BATCH_SIZE_DEFAULT}",
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(dest="cmd")
|
||||
|
||||
run_num_steps_parser = subparsers.add_parser(
|
||||
"run_num_steps", help="This variation profiles n engine.step() invocations."
|
||||
)
|
||||
run_num_steps_parser.add_argument(
|
||||
"-n",
|
||||
"--num-steps",
|
||||
type=int,
|
||||
help="Number of engine steps to profile.\n"
|
||||
"Setting it to 1, profiles only the prefill step.\n"
|
||||
"Setting it to 2, profiles the prefill and first decode step\n"
|
||||
"Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
|
||||
"and so on ...",
|
||||
)
|
||||
|
||||
run_to_completion_parser = subparsers.add_parser(
|
||||
"run_to_completion",
|
||||
help="This variation profiles all the engine.step() invocations"
|
||||
"until the engine exhausts all submitted requests.",
|
||||
)
|
||||
run_to_completion_parser.add_argument(
|
||||
"-n",
|
||||
"--complete-num-requests-per-step",
|
||||
type=int,
|
||||
help="Complete complete_num_requests_per_step requests every decode step."
|
||||
"For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
|
||||
"the profiler is run for 6 engine steps, with the steps processing, "
|
||||
"128, 128, 96, 64, 32, 1 requests respectively.\n"
|
||||
"Note that we tack-on a one-request step at the end as it is often "
|
||||
"useful.",
|
||||
)
|
||||
|
||||
EngineArgs.add_cli_args(parser)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
context = ProfileContext(
|
||||
engine_args=EngineArgs.from_cli_args(args),
|
||||
**{
|
||||
k: v
|
||||
for k, v in vars(args).items()
|
||||
if k in inspect.signature(ProfileContext).parameters
|
||||
},
|
||||
)
|
||||
run_profile(context, csv_output=args.csv, json_output=args.json)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -5,6 +5,7 @@ from urllib.request import urlopen
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
|
||||
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
|
||||
|
||||
|
||||
|
||||
@@ -1,81 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Experimental support for data-parallel inference with torchrun.
Note that data load balancing and distribution are done outside the vLLM engine;
no internal load balancing is supported in external_launcher mode.
"""

from vllm import LLM, SamplingParams

# Create prompts, the same across all ranks
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
] * 50

# Create sampling parameters, the same across all ranks
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Use `distributed_executor_backend="external_launcher"` so that
# this LLM engine/instance only creates one worker.
# It is important to set an explicit seed to make sure that
# all ranks have the same random seed, so that sampling can be
# deterministic across ranks.
llm = LLM(
    model="microsoft/Phi-mini-MoE-instruct",
    tensor_parallel_size=1,
    data_parallel_size=2,
    pipeline_parallel_size=1,
    enable_expert_parallel=False,
    distributed_executor_backend="external_launcher",
    max_model_len=4096,
    gpu_memory_utilization=0.6,
    seed=1,
)

dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size

prompts = [
    f"{idx}.{prompt}" for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank
]

outputs = llm.generate(prompts, sampling_params)


# all ranks will have the same outputs
print("-" * 50)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
    print("-" * 50)
"""
Further tips:

1. To communicate control messages across all ranks, use the CPU group,
a PyTorch ProcessGroup with the GLOO backend.

```python
from vllm.distributed.parallel_state import get_world_group
cpu_group = get_world_group().cpu_group
torch_rank = dist.get_rank(group=cpu_group)
if torch_rank == 0:
    # do something for rank 0, e.g. saving the results to disk.
```

2. To communicate data across all ranks, use the model's device group,
a PyTorch ProcessGroup with the NCCL backend.
```python
from vllm.distributed.parallel_state import get_world_group
device_group = get_world_group().device_group
```

3. To access the model directly in every rank, use the following code:
```python
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
```
"""
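As a small addition to tip 1 above, here is a sketch of gathering per-rank results onto rank 0 over the GLOO cpu group. It assumes the `outputs`, `cpu_group`, and `torch_rank` names from the example and tips above; it is not part of the removed file, only an illustration:

```python
import torch.distributed as dist

# Collect each rank's generated texts on rank 0 via the CPU (GLOO) group.
texts = [o.outputs[0].text for o in outputs]
gathered = [None] * dist.get_world_size(group=cpu_group) if torch_rank == 0 else None
dist.gather_object(texts, gathered, dst=0, group=cpu_group)
if torch_rank == 0:
    all_texts = [t for rank_texts in gathered for t in rank_texts]
    print(f"rank 0 collected {len(all_texts)} generations")
```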
@@ -126,23 +126,6 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    )


# Dots-OCR
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
    engine_args = EngineArgs(
        model="rednote-hilab/dots.ocr",
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

@@ -1693,7 +1676,6 @@ model_example_map = {
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "dots_ocr": run_dots_ocr,
    "command_a_vision": run_command_a_vision,
    "deepseek_vl_v2": run_deepseek_vl2,
    "ernie45_vl": run_ernie45_vl,

@@ -42,7 +42,7 @@ python client.py

### Server Configuration

The key parameters for chunked processing are in the `--pooler-config`:
The key parameters for chunked processing are in the `--override-pooler-config`:

```json
{

@@ -13,7 +13,7 @@ Prerequisites:

# MEAN pooling (processes all chunks, recommended for complete coverage)
vllm serve intfloat/multilingual-e5-large \
    --pooler-config \
    --override-pooler-config \
    '{"pooling_type": "MEAN", "normalize": true, ' \
    '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
    --served-model-name multilingual-e5-large \
@@ -23,7 +23,7 @@ Prerequisites:

# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
vllm serve BAAI/bge-large-en-v1.5 \
    --pooler-config \
    --override-pooler-config \
    '{"pooling_type": "CLS", "normalize": true, ' \
    '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
    --served-model-name bge-large-en-v1.5 \

@@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab
vllm serve "$MODEL_NAME" \
    --tensor-parallel-size "$GPU_COUNT" \
    --enforce-eager \
    --pooler-config "$POOLER_CONFIG" \
    --override-pooler-config "$POOLER_CONFIG" \
    --served-model-name ${MODEL_CODE} \
    --api-key "$API_KEY" \
    --trust-remote-code \

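Not part of this diff, but for orientation: once one of the servers above is running, long inputs can be embedded through the standard OpenAI-compatible endpoint. The port, API key, and served model name below are assumptions taken from the commands above, not values introduced by this change:

```python
from openai import OpenAI

# Minimal client sketch against the chunked-processing embedding server started with `vllm serve`.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="your-api-key")
resp = client.embeddings.create(
    model="multilingual-e5-large",       # matches --served-model-name above
    input=["A very long document ..."],  # chunked server-side when it exceeds the model's window
)
print(len(resp.data[0].embedding))
```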
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import dataclasses
import json
import logging
import os
@@ -325,7 +327,12 @@ def main():


    if args.command == "serialize":
        engine_args = EngineArgs.from_cli_args(args)
        eng_args_dict = {f.name: getattr(args, f.name) for f in
                         dataclasses.fields(EngineArgs)}

        engine_args = EngineArgs.from_cli_args(
            argparse.Namespace(**eng_args_dict)
        )

        input_dir = tensorizer_dir.rstrip('/')
        suffix = args.suffix if args.suffix else uuid.uuid4().hex

@ -102,7 +102,6 @@ plugins:
|
||||
- https://numpy.org/doc/stable/objects.inv
|
||||
- https://pytorch.org/docs/stable/objects.inv
|
||||
- https://psutil.readthedocs.io/en/stable/objects.inv
|
||||
- https://huggingface.co/docs/transformers/main/en/objects.inv
|
||||
|
||||
markdown_extensions:
|
||||
- attr_list
|
||||
|
||||
@ -70,6 +70,7 @@ line-length = 80
|
||||
"vllm/_version.py" = ["ALL"]
|
||||
# Python 3.8 typing - skip V0 code
|
||||
"vllm/attention/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/core/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/engine/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/executor/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/worker/**/*.py" = ["UP006", "UP035"]
|
||||
@ -110,6 +111,28 @@ ignore_missing_imports = true
|
||||
check_untyped_defs = true
|
||||
follow_imports = "silent"
|
||||
|
||||
# After fixing type errors resulting from follow_imports: "skip" -> "silent",
|
||||
# move the directory here and remove it from tools/mypy.sh
|
||||
files = [
|
||||
"vllm/*.py",
|
||||
"vllm/assets",
|
||||
"vllm/entrypoints",
|
||||
"vllm/core",
|
||||
"vllm/inputs",
|
||||
"vllm/logging_utils",
|
||||
"vllm/multimodal",
|
||||
"vllm/platforms",
|
||||
"vllm/transformers_utils",
|
||||
"vllm/triton_utils",
|
||||
"vllm/usage",
|
||||
]
|
||||
# TODO(woosuk): Include the code from Megatron and HuggingFace.
|
||||
exclude = [
|
||||
"vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
|
||||
# Ignore triton kernels in ops.
|
||||
'vllm/attention/ops/.*\.py$'
|
||||
]
|
||||
|
||||
[tool.isort]
|
||||
skip_glob = [
|
||||
".buildkite/*",
|
||||
|
||||
@ -24,7 +24,7 @@ outlines_core == 0.2.11
|
||||
# required for outlines backend disk cache
|
||||
diskcache == 5.6.3
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.24; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
|
||||
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128
|
||||
absl-py==2.1.0
|
||||
# via rouge-score
|
||||
accelerate==1.0.1
|
||||
|
||||
@ -11,7 +11,7 @@ from unittest.mock import Mock
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
from vllm import LLM, envs
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
@ -26,6 +26,14 @@ MODELS = [
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
llm = LLM("distilbert/distilgpt2")
|
||||
@ -68,6 +76,17 @@ def test_models(
|
||||
model_executor: str,
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
|
||||
if enable_prompt_embeds and envs.is_set(
|
||||
"VLLM_USE_V1") and envs.VLLM_USE_V1:
|
||||
pytest.skip("enable_prompt_embeds is not supported in v1.")
|
||||
|
||||
if not envs.VLLM_USE_V1:
|
||||
if async_scheduling:
|
||||
pytest.skip("async_scheduling only supported in v1.")
|
||||
if model_executor != "uni":
|
||||
pytest.skip("only test uniproc executor for v0.")
|
||||
|
||||
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
|
||||
pytest.skip(
|
||||
f"{backend} does not support gemma2 with full context length.")
|
||||
@ -145,6 +164,11 @@ def test_models_distributed(
|
||||
extra_env: dict[str, str],
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
|
||||
if enable_prompt_embeds and envs.is_set(
|
||||
"VLLM_USE_V1") and envs.VLLM_USE_V1:
|
||||
pytest.skip("enable_prompt_embeds is not supported in v1.")
|
||||
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
|
||||
@ -122,12 +122,11 @@ def test_cumem_with_cudagraph():
|
||||
# sleep mode with safetensors
|
||||
("meta-llama/Llama-3.2-1B", True),
|
||||
# sleep mode with pytorch checkpoint
|
||||
("facebook/opt-125m", True),
|
||||
("facebook/opt-125m", False),
|
||||
])
|
||||
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
||||
with monkeypatch.context() as m:
|
||||
assert use_v1
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
free, total = torch.cuda.mem_get_info()
|
||||
used_bytes_baseline = total - free # in case other process is running
|
||||
llm = LLM(model, enable_sleep_mode=True)
|
||||
|
||||
tests/build_cython.py (new file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import Cython.Compiler.Options
|
||||
from Cython.Build import cythonize
|
||||
from setuptools import setup
|
||||
|
||||
Cython.Compiler.Options.annotate = True
|
||||
|
||||
infiles = []
|
||||
|
||||
infiles += [
|
||||
"vllm/engine/llm_engine.py",
|
||||
"vllm/transformers_utils/detokenizer.py",
|
||||
"vllm/engine/output_processor/single_step.py",
|
||||
"vllm/outputs.py",
|
||||
"vllm/engine/output_processor/stop_checker.py",
|
||||
]
|
||||
|
||||
infiles += [
|
||||
"vllm/core/scheduler.py",
|
||||
"vllm/sequence.py",
|
||||
"vllm/core/block_manager.py",
|
||||
]
|
||||
|
||||
infiles += [
|
||||
"vllm/model_executor/layers/sampler.py",
|
||||
"vllm/sampling_params.py",
|
||||
"vllm/utils/__init__.py",
|
||||
]
|
||||
|
||||
setup(ext_modules=cythonize(infiles,
|
||||
annotate=False,
|
||||
force=True,
|
||||
compiler_directives={
|
||||
'language_level': "3",
|
||||
'infer_types': True
|
||||
}))
|
||||
|
||||
# example usage: python3 build_cython.py build_ext --inplace
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import weakref
|
||||
from collections.abc import Sequence
|
||||
from copy import deepcopy
|
||||
from typing import Callable, Union
|
||||
@ -11,26 +10,7 @@ from torch._ops import OpOverload
|
||||
|
||||
from vllm.compilation.fx_utils import find_op_nodes
|
||||
from vllm.compilation.inductor_pass import InductorPass
|
||||
from vllm.compilation.pass_manager import with_pattern_match_debug
|
||||
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
|
||||
from vllm.config import VllmConfig, get_current_vllm_config
|
||||
|
||||
|
||||
class LazyInitPass(InductorPass):
|
||||
"""
|
||||
If there's a pass that we want to initialize lazily in a test,
|
||||
we can wrap it in LazyInitPass, which will initialize the pass when invoked
|
||||
and then immediately invoke it.
|
||||
"""
|
||||
|
||||
def __init__(self, pass_cls: type[VllmInductorPass],
|
||||
vllm_config: VllmConfig):
|
||||
self.pass_cls = pass_cls
|
||||
self.vllm_config = weakref.proxy(vllm_config) # avoid cycle
|
||||
|
||||
def __call__(self, graph: fx.Graph) -> None:
|
||||
self.pass_ = self.pass_cls(self.vllm_config)
|
||||
self.pass_(graph)
|
||||
from vllm.config import get_current_vllm_config
|
||||
|
||||
|
||||
class TestBackend:
|
||||
@ -60,16 +40,10 @@ class TestBackend:
|
||||
example_inputs,
|
||||
config_patches=self.inductor_config)
|
||||
|
||||
@with_pattern_match_debug
|
||||
def post_pass(self, graph: fx.Graph):
|
||||
self.graph_pre_pass = deepcopy(graph)
|
||||
|
||||
VllmInductorPass.dump_prefix = 0
|
||||
for pass_ in self.custom_passes:
|
||||
pass_(graph)
|
||||
VllmInductorPass.dump_prefix += 1
|
||||
|
||||
VllmInductorPass.dump_prefix = None
|
||||
|
||||
self.graph_post_pass = deepcopy(graph)
|
||||
# assign by reference, will reflect the final state of the graph
|
||||
|
||||
@ -46,10 +46,7 @@ backend_configs = {
|
||||
# FA3 on Hopper
|
||||
"FA3":
|
||||
BackendConfig(name="FA3",
|
||||
env_vars={
|
||||
"VLLM_FLASH_ATTN_VERSION": "3",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
env_vars={"VLLM_FLASH_ATTN_VERSION": "3"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
},
|
||||
@ -69,7 +66,6 @@ backend_configs = {
|
||||
BackendConfig(name="FlashAttentionMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
@ -93,10 +89,7 @@ backend_configs = {
|
||||
# FA2
|
||||
"FA2":
|
||||
BackendConfig(name="FA2",
|
||||
env_vars={
|
||||
"VLLM_FLASH_ATTN_VERSION": "2",
|
||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
|
||||
},
|
||||
env_vars={"VLLM_FLASH_ATTN_VERSION": "2"},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL",
|
||||
}),
|
||||
|
||||
@ -15,7 +15,6 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
|
||||
VllmConfig, set_current_vllm_config)
|
||||
from vllm.envs import VLLM_USE_V1
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
# This import automatically registers `torch.ops.silly.attention`
|
||||
from ..silly_attention import get_global_counter, reset_global_counter
|
||||
@ -51,21 +50,16 @@ class SillyModel(nn.Module):
|
||||
return x
|
||||
|
||||
|
||||
def _run_simple_model(
|
||||
splitting_ops,
|
||||
use_inductor_graph_partition,
|
||||
use_inductor,
|
||||
expected_num_piecewise_graphs_seen,
|
||||
expected_num_piecewise_capturable_graphs_seen,
|
||||
expected_num_backend_compilations,
|
||||
expected_num_cudagraph_captured,
|
||||
):
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_simple_piecewise_compile(use_inductor):
|
||||
assert VLLM_USE_V1
|
||||
|
||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
use_cudagraph=True,
|
||||
use_inductor=use_inductor,
|
||||
splitting_ops=splitting_ops,
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
splitting_ops=["silly.attention"],
|
||||
cudagraph_copy_inputs=True,
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
))
|
||||
@ -76,11 +70,11 @@ def _run_simple_model(
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
|
||||
num_piecewise_capturable_graphs_seen=
|
||||
expected_num_piecewise_capturable_graphs_seen,
|
||||
num_backend_compilations=expected_num_backend_compilations,
|
||||
num_cudagraph_captured=expected_num_cudagraph_captured,
|
||||
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
|
||||
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
|
||||
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
|
||||
num_cudagraph_captured=
|
||||
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
), set_forward_context(None,
|
||||
vllm_config=vllm_config): # background context
|
||||
# warm up with background context
|
||||
@ -110,46 +104,3 @@ def _run_simple_model(
|
||||
output = model(input)
|
||||
assert get_global_counter() == 2
|
||||
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_simple_piecewise_compile(use_inductor):
|
||||
assert VLLM_USE_V1
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly.attention"],
|
||||
use_inductor_graph_partition=False,
|
||||
use_inductor=use_inductor,
|
||||
expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1
|
||||
expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
|
||||
expected_num_backend_compilations=
|
||||
3, # num_piecewise_capturable_graphs_seen
|
||||
expected_num_cudagraph_captured=
|
||||
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
)
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
|
||||
def test_simple_inductor_graph_partition(splitting_ops):
|
||||
assert VLLM_USE_V1
|
||||
if not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available "
|
||||
"in PyTorch 2.9+")
|
||||
|
||||
_run_simple_model(
|
||||
# inductor graph partition automatically resets splitting_ops
|
||||
# to be an empty list
|
||||
splitting_ops=splitting_ops,
|
||||
use_inductor_graph_partition=True,
|
||||
use_inductor=True,
|
||||
expected_num_piecewise_graphs_seen=
|
||||
1, # since not splitting at fx graph level
|
||||
expected_num_piecewise_capturable_graphs_seen=
|
||||
1, # since not splitting at fx graph level
|
||||
expected_num_backend_compilations=
|
||||
1, # since not splitting at fx graph level
|
||||
expected_num_cudagraph_captured=
|
||||
6, # inductor graph partition still captures 6
|
||||
# graph, same as fx graph partition.
|
||||
)
|
||||
|
||||
@ -60,5 +60,4 @@ direct_register_custom_op(
|
||||
mutates_args=["out"],
|
||||
fake_impl=silly_attention_fake,
|
||||
target_lib=silly_lib,
|
||||
tags=(torch._C.Tag.cudagraph_unsafe, ),
|
||||
)
|
||||
|
||||
@ -294,8 +294,6 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
|
||||
compiled_model = torch.compile(model, backend=backend)
|
||||
compiled_model(hidden_states)
|
||||
|
||||
assert async_tp_pass.matched_count == 1
|
||||
|
||||
# In pre-nodes, all gather or reduce scatter should exist,
|
||||
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not
|
||||
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
|
||||
|
||||
@ -20,6 +20,7 @@ class TestSetting:
|
||||
tp_size: int
|
||||
attn_backend: str
|
||||
method: str
|
||||
fullgraph: bool
|
||||
|
||||
|
||||
# we cannot afford testing the full Cartesian product
|
||||
@ -35,6 +36,7 @@ class TestSetting:
|
||||
tp_size=2,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="generate",
|
||||
fullgraph=True,
|
||||
),
|
||||
# llama model with quantization
|
||||
TestSetting(
|
||||
@ -44,6 +46,7 @@ class TestSetting:
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="generate",
|
||||
fullgraph=True,
|
||||
),
|
||||
# MoE model
|
||||
TestSetting(
|
||||
@ -53,6 +56,7 @@ class TestSetting:
|
||||
tp_size=2,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="generate",
|
||||
fullgraph=True,
|
||||
),
|
||||
# embedding model
|
||||
TestSetting(
|
||||
@ -69,6 +73,7 @@ class TestSetting:
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="encode",
|
||||
fullgraph=True,
|
||||
),
|
||||
TestSetting(
|
||||
model="BAAI/bge-base-en-v1.5",
|
||||
@ -77,6 +82,7 @@ class TestSetting:
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="encode",
|
||||
fullgraph=True,
|
||||
),
|
||||
# vision language model
|
||||
TestSetting(
|
||||
@ -86,6 +92,7 @@ class TestSetting:
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="generate_with_image",
|
||||
fullgraph=False,
|
||||
),
|
||||
],
|
||||
)
|
||||
@ -102,8 +109,9 @@ def test_compile_correctness(
|
||||
tp_size = test_setting.tp_size
|
||||
attn_backend = test_setting.attn_backend
|
||||
method = test_setting.method
|
||||
if cuda_device_count_stateless() < pp_size * tp_size:
|
||||
pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
|
||||
fullgraph = test_setting.fullgraph
|
||||
if cuda_device_count_stateless() != pp_size * tp_size:
|
||||
pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got "
|
||||
f"{cuda_device_count_stateless()}")
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
@ -141,5 +149,9 @@ def test_compile_correctness(
|
||||
]:
|
||||
all_args.append(final_args + [f"-O{level}"])
|
||||
all_envs.append({})
|
||||
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
|
||||
# "DYNAMO_ONCE" will always use fullgraph
|
||||
all_envs[-1][
|
||||
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
|
||||
|
||||
compare_all_settings(model, all_args * 3, all_envs, method=method)
|
||||
|
||||
@ -4,7 +4,7 @@ import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.config import CompilationConfig, VllmConfig
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.utils import _is_torch_equal_or_newer
|
||||
|
||||
|
||||
@ -26,14 +26,6 @@ def test_use_cudagraphs_dynamic(monkeypatch):
|
||||
assert not vllm_config.compilation_config.use_cudagraph
|
||||
|
||||
|
||||
def test_custom_op():
|
||||
# proper syntax
|
||||
_ = CompilationConfig(custom_ops=["+quant_fp8", "-silu_and_mul"])
|
||||
|
||||
with pytest.raises(ValueError, match="Invalid syntax '"):
|
||||
_ = CompilationConfig(custom_ops=["quant_fp8"])
|
||||
|
||||
|
||||
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
||||
@pytest.mark.forked
|
||||
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
|
||||
|
||||
@ -3,7 +3,6 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import tempfile
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
@ -11,13 +10,9 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from tests.v1.attention.utils import _Backend
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
|
||||
PassConfig)
|
||||
from vllm.config import CompilationConfig, CompilationLevel, PassConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
@ -84,7 +79,9 @@ def test_full_graph(
|
||||
):
|
||||
model, model_kwargs = model_info
|
||||
|
||||
with monkeypatch.context():
|
||||
with monkeypatch.context() as m:
|
||||
# make sure these models can be captured in full graph mode
|
||||
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
|
||||
print(f"MODEL={model}")
|
||||
|
||||
run_model(optimization_level, model, model_kwargs)
|
||||
@ -110,18 +107,6 @@ def test_full_graph(
|
||||
(CompilationConfig(level=CompilationLevel.PIECEWISE,
|
||||
debug_dump_path=tempfile.gettempdir()),
|
||||
("facebook/opt-125m", {})),
|
||||
] + [
|
||||
# graph inductor partition
|
||||
(
|
||||
CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
# inductor graph partition uses
|
||||
# torch._C.Tag.cudagraph_unsafe to specify splitting ops
|
||||
use_inductor_graph_partition=True,
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
compile_sizes=[1, 2]),
|
||||
model) for model in models_list(all=False)
|
||||
if is_torch_equal_or_newer("2.9.0.dev")
|
||||
])
|
||||
# only test some of the models
|
||||
@create_new_process_for_each_test()
|
||||
@ -129,51 +114,11 @@ def test_custom_compile_config(
|
||||
compilation_config: CompilationConfig,
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
):
|
||||
if (compilation_config.use_inductor_graph_partition
|
||||
and not is_torch_equal_or_newer("2.9.0.dev")):
|
||||
pytest.skip("inductor graph partition is only available "
|
||||
"in PyTorch 2.9+")
|
||||
|
||||
model, model_kwargs = model_info
|
||||
print(f"MODEL={model}")
|
||||
run_model(compilation_config, model, model_kwargs)
|
||||
|
||||
|
||||
def test_inductor_graph_partition_attn_fusion(caplog_vllm):
|
||||
if not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available "
|
||||
"in PyTorch 2.9+")
|
||||
|
||||
model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
use_inductor_graph_partition=True,
|
||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||
custom_ops=["+quant_fp8"],
|
||||
pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
|
||||
)
|
||||
model_kwargs = {
|
||||
"kv_cache_dtype": "fp8",
|
||||
"max_model_len": 1024,
|
||||
}
|
||||
with caplog_vllm.at_level(
|
||||
logging.DEBUG), global_force_attn_backend_context_manager(
|
||||
_Backend.FLASHINFER):
|
||||
run_model(compilation_config, model, model_kwargs)
|
||||
|
||||
try:
|
||||
assert ("Fused quantization onto 48 attention nodes"
|
||||
in caplog_vllm.text), caplog_vllm.text
|
||||
except AssertionError:
|
||||
# Note: this message is only triggered when the compilation goes
|
||||
# through the custom pass. Due to multiple layers of cache on
|
||||
# PyTorch side, the compilation of a graph may be cached such
|
||||
# that custom pass directly goes through cache. In this case,
|
||||
# we go through this branch and assert that the pass is not
|
||||
# triggered.
|
||||
assert "Fused quantization" not in caplog_vllm.text
|
||||
|
||||
|
||||
def run_model(compile_config: Union[int, CompilationConfig], model: str,
|
||||
model_kwargs: dict[str, Any]):
|
||||
prompts = [
|
||||
|
||||
@ -8,10 +8,9 @@ import vllm.envs as envs
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.compilation.fusion import FUSED_OPS, RMSNormQuantFusionPass
|
||||
from vllm.compilation.fusion import FUSED_OPS, FusionPass
|
||||
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import CompilationConfig, PassConfig, VllmConfig
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym)
|
||||
@ -59,12 +58,11 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
|
||||
vllm_config.compilation_config = CompilationConfig(
|
||||
pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True))
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
fusion_pass = RMSNormQuantFusionPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
fusion_pass = FusionPass.instance(vllm_config)
|
||||
act_quant_fusion_pass = ActivationQuantFusionPass(vllm_config)
|
||||
|
||||
passes = [noop_pass, fusion_pass, act_quant_fusion_pass, cleanup_pass
|
||||
] if do_fusion else [noop_pass, cleanup_pass]
|
||||
passes = [noop_pass, fusion_pass, act_quant_fusion_pass
|
||||
] if do_fusion else [noop_pass]
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
backend_func = TestBackend(*passes, func_pass)
|
||||
backend_no_func = TestBackend(*passes)
|
||||
|
||||
@ -4,11 +4,11 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
import vllm.plugins
|
||||
from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
|
||||
RMSNormQuantFusionPass)
|
||||
FusionPass)
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
|
||||
VllmConfig)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
@ -79,15 +79,15 @@ class TestModel(torch.nn.Module):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("hidden_size", [64])
|
||||
@pytest.mark.parametrize("num_tokens", [257])
|
||||
@pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
|
||||
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
|
||||
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
|
||||
@pytest.mark.parametrize("static", [True, False])
|
||||
# cuda_force_torch used to test torch code path on platforms that
|
||||
# cutlass_fp8_supported() == True.
|
||||
@pytest.mark.parametrize("cuda_force_torch",
|
||||
[True, False] if cutlass_fp8_supported() else [True])
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
|
||||
reason="Only test on CUDA and ROCm")
|
||||
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
|
||||
cuda_force_torch):
|
||||
@ -104,10 +104,9 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
|
||||
with vllm.config.set_current_vllm_config(vllm_config):
|
||||
# Reshape pass is needed for the fusion pass to work
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
fusion_pass = RMSNormQuantFusionPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
fusion_pass = FusionPass.instance(vllm_config)
|
||||
|
||||
backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
|
||||
backend = TestBackend(noop_pass, fusion_pass)
|
||||
model = TestModel(hidden_size, eps, static, cuda_force_torch)
|
||||
|
||||
# First dimension dynamic
|
||||
@ -129,8 +128,6 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
|
||||
|
||||
torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
|
||||
|
||||
assert fusion_pass.matched_count == 2
|
||||
|
||||
# In pre-nodes, fp8 quant should be there and fused kernels should not
|
||||
backend.check_before_ops(model.ops_in_model_before())
|
||||
|
||||
|
||||
@ -9,7 +9,6 @@ import vllm.envs as envs
|
||||
from vllm.compilation.collective_fusion import AllReduceFusionPass
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig,
|
||||
ModelConfig, PassConfig, VllmConfig)
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
@ -216,10 +215,8 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
|
||||
all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
|
||||
backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass,
|
||||
cleanup_pass)
|
||||
backend = TestBackend(all_reduce_fusion_pass, noop_pass, func_pass)
|
||||
|
||||
token_num = batch_size * seq_len
|
||||
model = test_model_cls(hidden_size, token_num)
|
||||
@ -230,7 +227,6 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
|
||||
compiled_model = torch.compile(model, backend=backend)
|
||||
compiled_model(hidden_states, residual)
|
||||
|
||||
assert all_reduce_fusion_pass.matched_count == 1
|
||||
backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
|
||||
backend.check_after_ops(model.ops_in_model_after())
|
||||
del all_reduce_fusion_pass
|
||||
|
||||
@ -6,19 +6,18 @@ from typing import Optional
|
||||
import pytest
|
||||
import torch._dynamo
|
||||
|
||||
from tests.compile.backend import LazyInitPass, TestBackend
|
||||
from tests.compile.backend import TestBackend
|
||||
from tests.models.utils import check_outputs_equal
|
||||
from tests.v1.attention.utils import (BatchSpec, _Backend,
|
||||
create_common_attn_metadata)
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
from vllm.attention import Attention, AttentionMetadata
|
||||
from vllm.attention import Attention
|
||||
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||
from vllm.compilation.fusion import QUANT_OPS
|
||||
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
|
||||
from vllm.compilation.fx_utils import find_op_nodes
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
|
||||
ModelConfig, PassConfig, SchedulerConfig, VllmConfig,
|
||||
set_current_vllm_config)
|
||||
@ -28,7 +27,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
Fp8LinearOp)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
@ -55,7 +53,8 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
|
||||
# Use global backends
|
||||
global backend, backend_unfused
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
use_v1 = False # can be made a param once V1 support added
|
||||
monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1)))
|
||||
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
|
||||
|
||||
# Prompt 4 seems too open-ended, differs between fused and unfused
|
||||
@ -105,7 +104,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
|
||||
|
||||
# AttnFusionPass needs attention layers to be registered in config upon init
|
||||
# so we initialize it during compilation.
|
||||
attn_pass = LazyInitPass(AttnFusionPass, vllm_config)
|
||||
attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw)
|
||||
backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass)
|
||||
llm2 = LLM(model,
|
||||
enforce_eager=True,
|
||||
@ -198,8 +197,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
def build_attn_metadata(self, batch_size: int, use_hnd: bool) \
|
||||
-> AttentionMetadata:
|
||||
def build_attn_metadata(self, batch_size: int, use_hnd: bool):
|
||||
"""Initialize attention metadata."""
|
||||
|
||||
# Create common attn metadata
|
||||
@ -336,16 +334,11 @@ else:
|
||||
[7, 256, 533] if current_platform.is_cuda() else [8])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize("model_name, model_class", MODELS)
|
||||
@pytest.mark.parametrize("backend",
|
||||
[_Backend.FLASHINFER] if current_platform.is_cuda()
|
||||
else [_Backend.TRITON_ATTN_VLLM_V1])
|
||||
@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
|
||||
current_platform.is_cuda() else [_Backend.ROCM_FLASH])
|
||||
@pytest.mark.parametrize(
|
||||
"split_attention",
|
||||
[False, True] if current_platform.is_rocm() else [False])
|
||||
# TODO(boyuan): test inductor graph partition on rocm
|
||||
@pytest.mark.parametrize(
|
||||
"use_inductor_graph_partition",
|
||||
[False] if current_platform.is_rocm() else [False, True])
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Only test ROCm or CUDA")
|
||||
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
|
||||
@ -359,15 +352,9 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
||||
dtype: torch.dtype, model_name: str,
|
||||
model_class: type[AttentionQuantPatternModel],
|
||||
backend: _Backend, split_attention: bool,
|
||||
use_inductor_graph_partition: bool,
|
||||
monkeypatch, dist_init, caplog_vllm):
|
||||
monkeypatch, dist_init):
|
||||
"""Test AttentionStaticQuantPattern fusion pass"""
|
||||
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer(
|
||||
"2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available "
|
||||
"in PyTorch 2.9+")
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
if split_attention:
|
||||
monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
|
||||
@ -385,7 +372,6 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
||||
compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
custom_ops=["+quant_fp8"],
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
),
|
||||
cache_config=CacheConfig(cache_dtype="fp8"))
|
||||
|
||||
@ -449,17 +435,15 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
||||
|
||||
# Create test backend with fusion passes enabled
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
attn_pass = LazyInitPass(AttnFusionPass, vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
|
||||
test_backend = TestBackend(noop_pass, attn_pass, cleanup_pass)
|
||||
attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw
|
||||
)
|
||||
test_backend = TestBackend(noop_pass, attn_pass)
|
||||
|
||||
# Compile model with fusion enabled
|
||||
model_compiled = torch.compile(model_fused,
|
||||
backend=test_backend,
|
||||
fullgraph=True)
|
||||
assert model_compiled.attn._o_scale_float is None
|
||||
|
||||
result_fused_1 = model_compiled(q, k, v)
|
||||
|
||||
if backend == _Backend.FLASHINFER:
|
||||
@ -469,7 +453,6 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
||||
# _o_scale_float
|
||||
assert model_compiled.attn._o_scale_float is not None
|
||||
result_fused_2 = model_compiled(q, k, v)
|
||||
|
||||
assert model_compiled.attn._o_scale_float is not None
|
||||
|
||||
torch.testing.assert_close(result_unfused,
|
||||
@ -488,9 +471,6 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
|
||||
test_backend.check_before_ops([QUANT_OPS[quant_key]],
|
||||
fully_replaced=True)
|
||||
|
||||
# access the underlying `AttnFusionPass` on the `LazyInitPass`
|
||||
assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
|
||||
|
||||
# Check attention ops in the graph before and after fusion
|
||||
attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass))
|
||||
attn_nodes_post = list(find_op_nodes(ATTN_OP,
|
||||
|
||||
@ -6,12 +6,10 @@ import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.compilation.fusion import RMSNormQuantFusionPass
|
||||
from vllm.compilation.fusion import FusionPass
|
||||
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
|
||||
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
|
||||
from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
|
||||
PassConfig, VllmConfig)
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
@ -106,7 +104,7 @@ class TestQuantModel(torch.nn.Module):
|
||||
# Initialize weights
|
||||
torch.nn.init.normal_(self.gate_proj, std=0.02)
|
||||
|
||||
self.fp8_linear = Fp8LinearOp(act_quant_static=True)
|
||||
self.fp8_linear = Fp8LinearOp(use_per_token_if_dynamic=False)
|
||||
|
||||
self.scale = torch.rand(1, dtype=torch.float32)
|
||||
# Create a weight that is compatible with torch._scaled_mm,
|
||||
@ -139,7 +137,8 @@ class TestQuantModel(torch.nn.Module):
|
||||
# layer normalization
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
|
||||
# scaled_mm with static input quantization
|
||||
# for static input quantization
|
||||
# self.fp8_linear is initialized with use_per_token_if_dynamic=False
|
||||
fp8_linear_result = self.fp8_linear.apply(norm_output,
|
||||
self.w,
|
||||
self.wscale,
|
||||
@ -254,20 +253,16 @@ def sequence_parallelism_pass_on_test_model(
|
||||
dtype=dtype,
|
||||
seed=42)
|
||||
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
cleanup_pass = PostCleanupPass(vllm_config)
|
||||
|
||||
passes_for_backend: list[VllmInductorPass] = \
|
||||
[noop_pass, sequence_parallelism_pass]
|
||||
passes_for_backend = [noop_pass, sequence_parallelism_pass]
|
||||
|
||||
if enable_fusion:
|
||||
fusion_pass = RMSNormQuantFusionPass(vllm_config)
|
||||
fusion_pass = FusionPass.instance(vllm_config)
|
||||
passes_for_backend.append(fusion_pass)
|
||||
|
||||
passes_for_backend.append(cleanup_pass)
|
||||
|
||||
backend_no_func = TestBackend(*passes_for_backend)
|
||||
backend_func = TestBackend(*passes_for_backend, func_pass)
|
||||
|
||||
@ -284,8 +279,6 @@ def sequence_parallelism_pass_on_test_model(
|
||||
compiled_model_func = torch.compile(model, backend=backend_func)
|
||||
compiled_model_func(hidden_states, residual)
|
||||
|
||||
assert sequence_parallelism_pass.matched_count == 1
|
||||
|
||||
# In pre-nodes, all reduce should be there,
|
||||
# reduce scatter and all gather should not
|
||||
backend_no_func.check_before_ops(model.ops_in_model_before())
|
||||
|
||||
@ -15,7 +15,6 @@ from vllm.compilation.activation_quant_fusion import (
|
||||
# yapf: enable
|
||||
from vllm.compilation.fusion import QUANT_OPS
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
from vllm.config import CompilationConfig, PassConfig, VllmConfig
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
@ -70,10 +69,6 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
|
||||
|
||||
def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs):
|
||||
super().__init__()
|
||||
from vllm.compilation.activation_quant_fusion import (
|
||||
silu_and_mul_nvfp4_quant_supported)
|
||||
assert silu_and_mul_nvfp4_quant_supported
|
||||
|
||||
self.silu_and_mul = SiluAndMul()
|
||||
|
||||
# create nvfp4 weight
|
||||
@ -132,11 +127,7 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class,
|
||||
pass_config=PassConfig(enable_fusion=True, enable_noop=True))
|
||||
fusion_pass = ActivationQuantFusionPass(config)
|
||||
|
||||
passes = [
|
||||
NoOpEliminationPass(config), fusion_pass,
|
||||
PostCleanupPass(config)
|
||||
]
|
||||
backend = TestBackend(*passes)
|
||||
backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
|
||||
model = model_class(hidden_size=hidden_size,
|
||||
cuda_force_torch=cuda_force_torch,
|
||||
x=x)
|
||||
@ -160,8 +151,6 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, dtype, model_class,
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
|
||||
assert fusion_pass.matched_count == 1
|
||||
|
||||
# In pre-nodes, quant op should be present and fused kernels should not
|
||||
backend.check_before_ops(model.ops_in_model_before())
|
||||
|
||||
|
||||
@ -19,7 +19,6 @@ import socket
|
||||
import tempfile
|
||||
import threading
|
||||
from collections.abc import Generator
|
||||
from contextlib import nullcontext
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
|
||||
|
||||
@ -40,20 +39,19 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config.model import (ConvertOption, RunnerOption,
|
||||
_get_and_verify_dtype)
|
||||
from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
|
||||
to_enc_dec_tuple_list, zip_enc_dec_prompts)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.sequence import Logprob
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -160,6 +158,26 @@ def cleanup_VLLM_USE_V1(monkeypatch):
|
||||
monkeypatch.delenv("VLLM_USE_V1")
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def run_with_both_engines(request, monkeypatch):
|
||||
# Automatically runs tests twice, once with V1 and once without
|
||||
use_v1 = request.param
|
||||
# Tests decorated with `@skip_v1` are only run without v1
|
||||
skip_v0 = request.node.get_closest_marker("skip_v0")
|
||||
skip_v1 = request.node.get_closest_marker("skip_v1")
|
||||
|
||||
if use_v1:
|
||||
if skip_v1:
|
||||
pytest.skip("Skipping test on vllm V1")
|
||||
monkeypatch.setenv('VLLM_USE_V1', '1')
|
||||
else:
|
||||
if skip_v0:
|
||||
pytest.skip("Skipping test on vllm V0")
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def init_test_http_connection():
|
||||
# pytest_asyncio may use a different event loop per test
|
||||
@ -226,6 +244,39 @@ class DecoderPromptType(Enum):
|
||||
EMPTY_STR = 3
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example_encoder_decoder_prompts(
|
||||
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
|
||||
'''
|
||||
Returns an encoder prompt list and a decoder prompt list, wherein each pair
|
||||
of same-index entries in both lists corresponds to an (encoder prompt,
|
||||
decoder prompt) tuple.
|
||||
|
||||
Returns:
|
||||
|
||||
* Encoder prompt list
|
||||
* Decoder prompt list (reverse of encoder prompt list)
|
||||
'''
|
||||
|
||||
encoder_prompts = []
|
||||
for filename in _TEST_PROMPTS:
|
||||
encoder_prompts += _read_prompts(filename)
|
||||
|
||||
custom_decoder_prompts = encoder_prompts[::-1]
|
||||
empty_str_decoder_prompts = [""] * len(encoder_prompts)
|
||||
none_decoder_prompts = [None] * len(encoder_prompts)
|
||||
|
||||
# NONE decoder prompt type
|
||||
return {
|
||||
DecoderPromptType.NONE:
|
||||
zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
|
||||
DecoderPromptType.EMPTY_STR:
|
||||
zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
|
||||
DecoderPromptType.CUSTOM:
|
||||
zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example_long_prompts() -> list[str]:
|
||||
prompts = []
|
||||
@ -287,35 +338,6 @@ class HfRunner:
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
) -> None:
|
||||
init_ctx = (nullcontext() if default_torch_num_threads is None else
|
||||
set_default_torch_num_threads(default_torch_num_threads))
|
||||
|
||||
with init_ctx:
|
||||
self._init(
|
||||
model_name=model_name,
|
||||
dtype=dtype,
|
||||
model_kwargs=model_kwargs,
|
||||
trust_remote_code=trust_remote_code,
|
||||
is_sentence_transformer=is_sentence_transformer,
|
||||
is_cross_encoder=is_cross_encoder,
|
||||
skip_tokenizer_init=skip_tokenizer_init,
|
||||
auto_cls=auto_cls,
|
||||
)
|
||||
|
||||
def _init(
|
||||
self,
|
||||
model_name: str,
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
trust_remote_code: bool = True,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
) -> None:
|
||||
model_name = maybe_model_redirect(model_name)
|
||||
self.model_name = model_name
|
||||
@ -668,6 +690,68 @@ class HfRunner:
|
||||
return [(output_ids, output_str, output_logprobs)
|
||||
for output_ids, output_str, output_logprobs in outputs]
|
||||
|
||||
def generate_encoder_decoder_greedy_logprobs_limit(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[TokensTextLogprobs]:
|
||||
'''
|
||||
Greedy logprobs generation for vLLM encoder/decoder models
|
||||
'''
|
||||
|
||||
all_logprobs: list[list[dict[int, float]]] = []
|
||||
all_output_ids: list[list[int]] = []
|
||||
all_output_strs: list[str] = []
|
||||
|
||||
for i, (encoder_prompt, decoder_prompt) in enumerate(
|
||||
to_enc_dec_tuple_list(encoder_decoder_prompts)):
|
||||
processor_kwargs: dict[str, Any] = {
|
||||
"text": encoder_prompt,
|
||||
"return_tensors": "pt",
|
||||
}
|
||||
if images is not None and images[i] is not None:
|
||||
processor_kwargs["images"] = images[i]
|
||||
|
||||
encoder_inputs = self.processor(**processor_kwargs)
|
||||
encoder_inputs = self.wrap_device(encoder_inputs)
|
||||
|
||||
if decoder_prompt is None:
|
||||
decoder_input_ids = None
|
||||
else:
|
||||
decoder_inputs = self.tokenizer(decoder_prompt,
|
||||
return_tensors="pt")
|
||||
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
|
||||
|
||||
output = self.model.generate(
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_tokens,
|
||||
output_hidden_states=True,
|
||||
return_dict_in_generate=True,
|
||||
**encoder_inputs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
(
|
||||
seq_logprobs_lst,
|
||||
output_len,
|
||||
) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
|
||||
num_logprobs)
|
||||
|
||||
all_logprobs.append(seq_logprobs_lst)
|
||||
seq_ids = output.sequences[0]
|
||||
output_ids = seq_ids[-output_len:]
|
||||
all_output_ids.append(output_ids.tolist())
|
||||
all_output_strs.append(self.tokenizer.decode(output_ids))
|
||||
|
||||
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
|
||||
return [(output_ids, output_str, output_logprobs)
|
||||
for output_ids, output_str, output_logprobs in outputs]
|
||||
|
||||
def encode(self, prompts: list[str], *args,
|
||||
**kwargs) -> list[list[torch.Tensor]]:
|
||||
return self.model.encode(prompts, *args, **kwargs)
|
||||
@ -724,32 +808,26 @@ class VllmRunner:
|
||||
enable_chunked_prefill: Optional[bool] = False,
|
||||
swap_space: int = 4,
|
||||
enforce_eager: Optional[bool] = False,
|
||||
# Set this to avoid hanging issue
|
||||
default_torch_num_threads: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
init_ctx = (nullcontext() if default_torch_num_threads is None else
|
||||
set_default_torch_num_threads(default_torch_num_threads))
|
||||
|
||||
with init_ctx:
|
||||
self.llm = LLM(
|
||||
model=model_name,
|
||||
runner=runner,
|
||||
convert=convert,
|
||||
tokenizer=tokenizer_name,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
seed=seed,
|
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
**kwargs,
)
self.llm = LLM(
model=model_name,
runner=runner,
convert=convert,
tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
**kwargs,
)

def get_inputs(
self,
@ -862,6 +940,26 @@ class VllmRunner:
if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)

def generate_encoder_decoder_w_logprobs(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
sampling_params: SamplingParams,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
'''
Logprobs generation for vLLM encoder/decoder models
'''

assert sampling_params.logprobs is not None
req_outputs = self.llm.generate(encoder_decoder_prompts,
sampling_params=sampling_params)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)

def generate_greedy(
self,
prompts: Union[list[str], list[torch.Tensor]],
@ -939,6 +1037,29 @@ class VllmRunner:

return perplexities

def generate_encoder_decoder_greedy_logprobs(
self,
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
max_tokens: int,
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int] = None,
skip_special_tokens: bool = True,
) -> Union[list[TokensTextLogprobs],
list[TokensTextLogprobsPromptLogprobs]]:
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
logprobs=num_logprobs,
prompt_logprobs=(num_prompt_logprobs),
skip_special_tokens=skip_special_tokens,
)
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''

return self.generate_encoder_decoder_w_logprobs(
encoder_decoder_prompts, greedy_logprobs_params)

def generate_beam_search(
self,
prompts: list[str],
@ -1003,7 +1124,17 @@ class VllmRunner:
return [req_output.outputs.score for req_output in req_outputs]

def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
return self.llm.apply_model(func)
if hasattr(self.llm.llm_engine, "model_executor"):
# This works either in V0 or in V1 with
# VLLM_ENABLE_V1_MULTIPROCESSING=0
executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)

# This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1
def _apply_model(self):
return func(self.get_model())

return self.llm.llm_engine.collective_rpc(_apply_model)

def get_llm(self) -> LLM:
return self.llm

@ -32,6 +32,10 @@ def _test_stopping(llm: LLM,
assert output.stop_reason == expected_reason


def _set_async_mode(llm, is_async):
llm.llm_engine.scheduler[0].use_async_output_proc = is_async


def _stop_basic(llm):
_test_stopping(llm,
stop=["."],
@ -99,8 +103,40 @@ def test_stop_strings():
# async output processing below.
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

_stop_basic(llm)
_stop_multi_tokens(llm)
_stop_partial_token(llm)
# FIXME: this does not respect include_in_output=False
# _stop_token_id(llm)
if envs.VLLM_USE_V1:
_stop_basic(llm)
else:
_set_async_mode(llm, True)
_stop_basic(llm)

_set_async_mode(llm, False)
_stop_basic(llm)

if envs.VLLM_USE_V1:
_stop_multi_tokens(llm)
else:
_set_async_mode(llm, True)
_stop_multi_tokens(llm)

_set_async_mode(llm, False)
_stop_multi_tokens(llm)

if envs.VLLM_USE_V1:
_stop_partial_token(llm)
else:
_set_async_mode(llm, True)
_stop_partial_token(llm)

_set_async_mode(llm, False)
_stop_partial_token(llm)

if envs.VLLM_USE_V1:
# FIXME: this does not respect include_in_output=False
# _stop_token_id(llm)
pass
else:
_set_async_mode(llm, True)
_stop_token_id(llm)

_set_async_mode(llm, False)
_stop_token_id(llm)

@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional

import pytest

from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config


@ -1,81 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# unit test for `examples/offline_inference/torchrun_example.py`
import os
import random

import torch.distributed as dist

from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_tp_group, get_world_group

dist.init_process_group(backend="gloo")

# Create prompts
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
dp_size = int(os.getenv("DP_SIZE", "1"))
dp_rank = int(os.getenv("DP_RANK", "0"))

if dp_size > 1:
# distribute the prompts across the data parallel ranks
prompts = [
prompt for idx, prompt in enumerate(prompts)
if idx % dp_size == dp_rank
]

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# set different `gpu_memory_utilization` and `swap_space` for different ranks,
# to test if all ranks agree on the same kv cache configuration.
llm = LLM(model="microsoft/Phi-mini-MoE-instruct",
tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
swap_space=random.randint(1, 4),
seed=0)

outputs = llm.generate(prompts, sampling_params)

group = get_world_group() if dp_size == 1 else get_tp_group()
cpu_group = group.cpu_group
group_rank = dist.get_rank(group=cpu_group)


def test_consistent_across_ranks(obj):
if group_rank == 0:
dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group)
else:
container = [None]
dist.broadcast_object_list(container,
src=group.ranks[0],
group=cpu_group)
assert container[0] == obj


test_consistent_across_ranks(
llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
test_consistent_across_ranks(
llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)

# make sure we can access the model parameters from the calling process
# of the `LLM` instance.
params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
model.parameters())
test_consistent_across_ranks(len(params))

# all ranks should have the same outputs
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
test_consistent_across_ranks(prompt)
test_consistent_across_ranks(generated_text)
print(f"Rank {group_rank}, Prompt: {prompt!r}, "
f"Generated text: {generated_text!r}")
@ -25,6 +25,12 @@ TOKEN_IDS = [
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass


@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to

@ -6,6 +6,14 @@ import pytest
from vllm import LLM


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


def test_empty_prompt():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):

@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import datetime
|
||||
from typing import Union
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
@ -285,62 +284,3 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str,
|
||||
output.extend(chunk.choices[0].delta.tool_calls)
|
||||
for o in output:
|
||||
assert o.id is None or o.id == 'functions.get_current_weather:0'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("arguments", ["{}", ''])
|
||||
async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str,
|
||||
arguments: str):
|
||||
# Step 1: Define a tool that requires no parameters
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_time",
|
||||
"description":
|
||||
"Get the current date and time. No parameters needed.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}, # No parameters
|
||||
"required": [] # No required fields
|
||||
}
|
||||
}
|
||||
}]
|
||||
messages = [{"role": "user", "content": "What time is it now?"}]
|
||||
# Step 2: Send user message and let model decide whether to call the tool
|
||||
response = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
tool_choice="auto" # Let model choose automatically
|
||||
)
|
||||
|
||||
# Step 3: Check if model wants to call a tool
|
||||
message = response.choices[0].message
|
||||
if message.tool_calls:
|
||||
# Get the first tool call
|
||||
tool_call = message.tool_calls[0]
|
||||
tool_name = tool_call.function.name
|
||||
# Step 4: Execute the tool locally (no parameters)
|
||||
if tool_name == "get_current_time":
|
||||
# Test both empty string and "{}" for no-arg tool calls
|
||||
tool_call.function.arguments = arguments
|
||||
messages.append(message)
|
||||
current_time = datetime.datetime.now()
|
||||
result = current_time.isoformat()
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tool_call.id,
|
||||
"content": result,
|
||||
})
|
||||
# Step 5: Send tool result back to model to continue conversation
|
||||
final_response = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
)
|
||||
# Output final natural language response
|
||||
assert final_response.choices[0].message.content is not None
|
||||
|
||||
else:
|
||||
# No tool called — just print model's direct reply
|
||||
assert message.content is not None
|
||||
|
||||
@ -14,6 +14,9 @@ from transformers import AutoConfig
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
pytest.skip("Skipping prompt_embeds test until V1 supports it.",
|
||||
allow_module_level=True)
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
@ -33,6 +36,7 @@ def default_server_args() -> list[str]:
|
||||
"--enforce-eager",
|
||||
# Prompt Embeds server args
|
||||
"--enable-prompt-embeds",
|
||||
"--no-enable-chunked-prefill",
|
||||
]
|
||||
|
||||
|
||||
@ -60,7 +64,6 @@ def create_dummy_embeds(num_tokens: int = 5) -> str:
|
||||
return base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||
|
||||
|
||||
@pytest.mark.skip("This test is skipped because it is flaky.")
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_completions_with_prompt_embeds(
|
||||
|
||||
@ -5,7 +5,7 @@ from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from http import HTTPStatus
|
||||
from typing import Optional
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
@ -83,31 +83,20 @@ def register_mock_resolver():
|
||||
def mock_serving_setup():
|
||||
"""Provides a mocked engine and serving completion instance."""
|
||||
mock_engine = MagicMock(spec=AsyncLLM)
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
|
||||
tokenizer = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.get_tokenizer = AsyncMock(return_value=tokenizer)
|
||||
|
||||
async def mock_add_lora_side_effect(lora_request: LoRARequest):
|
||||
def mock_add_lora_side_effect(lora_request: LoRARequest):
|
||||
"""Simulate engine behavior when adding LoRAs."""
|
||||
if lora_request.lora_name == "test-lora":
|
||||
# Simulate successful addition
|
||||
return True
|
||||
if lora_request.lora_name == "invalid-lora":
|
||||
return
|
||||
elif lora_request.lora_name == "invalid-lora":
|
||||
# Simulate failure during addition (e.g. invalid format)
|
||||
raise ValueError(f"Simulated failure adding LoRA: "
|
||||
f"{lora_request.lora_name}")
|
||||
return True
|
||||
|
||||
mock_engine.add_lora = AsyncMock(side_effect=mock_add_lora_side_effect)
|
||||
|
||||
async def mock_generate(*args, **kwargs):
|
||||
for _ in []:
|
||||
yield _
|
||||
|
||||
mock_engine.generate = MagicMock(spec=AsyncLLM.generate,
|
||||
side_effect=mock_generate)
|
||||
|
||||
mock_engine.add_lora.side_effect = mock_add_lora_side_effect
|
||||
mock_engine.generate.reset_mock()
|
||||
mock_engine.add_lora.reset_mock()
|
||||
|
||||
@ -142,7 +131,7 @@ async def test_serving_completion_with_lora_resolver(mock_serving_setup,
|
||||
with suppress(Exception):
|
||||
await serving_completion.create_completion(req_found)
|
||||
|
||||
mock_engine.add_lora.assert_awaited_once()
|
||||
mock_engine.add_lora.assert_called_once()
|
||||
called_lora_request = mock_engine.add_lora.call_args[0][0]
|
||||
assert isinstance(called_lora_request, LoRARequest)
|
||||
assert called_lora_request.lora_name == lora_model_name
|
||||
@ -168,7 +157,7 @@ async def test_serving_completion_resolver_not_found(mock_serving_setup,
|
||||
|
||||
response = await serving_completion.create_completion(req)
|
||||
|
||||
mock_engine.add_lora.assert_not_awaited()
|
||||
mock_engine.add_lora.assert_not_called()
|
||||
mock_engine.generate.assert_not_called()
|
||||
|
||||
assert isinstance(response, ErrorResponse)
|
||||
@ -192,7 +181,7 @@ async def test_serving_completion_resolver_add_lora_fails(
|
||||
response = await serving_completion.create_completion(req)
|
||||
|
||||
# Assert add_lora was called before the failure
|
||||
mock_engine.add_lora.assert_awaited_once()
|
||||
mock_engine.add_lora.assert_called_once()
|
||||
called_lora_request = mock_engine.add_lora.call_args[0][0]
|
||||
assert isinstance(called_lora_request, LoRARequest)
|
||||
assert called_lora_request.lora_name == invalid_model
|
||||
|
||||
@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
|
||||
"--port",
|
||||
port,
|
||||
],
|
||||
env={"VLLM_USE_V1": "1"})
|
||||
env={"VLLM_USE_V1": "1" if use_v1 else "0"})
|
||||
|
||||
def is_server_up(url):
|
||||
try:
|
||||
|
||||
@ -287,57 +287,6 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
|
||||
assert response3.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_streaming_types(client: OpenAI, model_name: str):
|
||||
prompts = [
|
||||
"tell me a story about a cat in 20 words",
|
||||
]
|
||||
|
||||
# this links the "done" type with the "start" type
|
||||
# so every "done" type should have a corresponding "start" type
|
||||
# and every open block should be closed by the end of the stream
|
||||
pairs_of_event_types = {
|
||||
"response.completed": "response.created",
|
||||
"response.output_item.done": "response.output_item.added",
|
||||
"response.content_part.done": "response.content_part.added",
|
||||
"response.output_text.done": "response.output_text.delta",
|
||||
"response.web_search_call.done": "response.web_search_call.added",
|
||||
"response.reasoning_text.done": "response.reasoning_text.delta",
|
||||
"response.reasoning_part.done": "response.reasoning_part.added",
|
||||
}
|
||||
|
||||
for prompt in prompts:
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=prompt,
|
||||
reasoning={"effort": "low"},
|
||||
tools=[],
|
||||
stream=True,
|
||||
background=False,
|
||||
)
|
||||
|
||||
stack_of_event_types = []
|
||||
async for event in response:
|
||||
if event.type == 'response.created':
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type == 'response.completed':
|
||||
assert stack_of_event_types[-1] == pairs_of_event_types[
|
||||
event.type]
|
||||
stack_of_event_types.pop()
|
||||
if event.type.endswith("added"):
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type.endswith("delta"):
|
||||
if stack_of_event_types[-1] == event.type:
|
||||
continue
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type.endswith("done"):
|
||||
assert stack_of_event_types[-1] == pairs_of_event_types[
|
||||
event.type]
|
||||
stack_of_event_types.pop()
|
||||
assert len(stack_of_event_types) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("background", [True, False])
|
||||
@ -394,10 +343,7 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
|
||||
assert event.item_id == current_item_id
|
||||
|
||||
# verify content_index_id is correct
|
||||
if event.type in [
|
||||
"response.content_part.added",
|
||||
"response.reasoning_part.added"
|
||||
]:
|
||||
if event.type == "response.content_part.added":
|
||||
assert event.content_index != current_content_index
|
||||
current_content_index = event.content_index
|
||||
elif event.type in [
|
||||
@ -515,7 +461,6 @@ async def test_function_calling(client: OpenAI, model_name: str):
|
||||
model=model_name,
|
||||
input="What's the weather like in Paris today?",
|
||||
tools=tools,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
@ -744,18 +689,3 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
|
||||
assert response_2 is not None
|
||||
assert response_2.status == "completed"
|
||||
assert response_2.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_output_messages_enabled(client: OpenAI, model_name: str,
|
||||
server):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is the capital of South Korea?",
|
||||
extra_body={"enable_response_messages": True})
|
||||
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
assert len(response.input_messages) > 0
|
||||
assert len(response.output_messages) > 0
|
||||
|
||||
@ -45,39 +45,8 @@ TOOLS = [{
|
||||
},
|
||||
}]
|
||||
|
||||
PRODUCT_TOOLS = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_product_info",
|
||||
"description": "Get detailed information of a product based on its "
|
||||
"product ID.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"inserted": {
|
||||
"type": "boolean",
|
||||
"description": "inserted.",
|
||||
},
|
||||
"product_id": {
|
||||
"type": "integer",
|
||||
"description": "The product ID of the product.",
|
||||
},
|
||||
},
|
||||
"required": ["product_id", "inserted"],
|
||||
},
|
||||
},
|
||||
}]
|
||||
|
||||
MESSAGES = [{"role": "user", "content": "What's the weather like in Boston?"}]
|
||||
|
||||
PRODUCT_MESSAGES = [{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Hi! Do you have any detailed information about the product id "
|
||||
"7355608 and inserted true?"
|
||||
}]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_non_streaming_tool_call():
|
||||
@ -158,103 +127,3 @@ async def test_streaming_tool_call():
|
||||
print("\n[Streaming Test Passed]")
|
||||
print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
|
||||
print(f"Reconstructed Arguments: {arguments}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_non_streaming_product_tool_call():
|
||||
"""Test tool call integer and boolean parameters in non-streaming mode."""
|
||||
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
|
||||
client = server.get_async_client()
|
||||
|
||||
response = await client.chat.completions.create(
|
||||
model=LORA_MODEL,
|
||||
messages=PRODUCT_MESSAGES,
|
||||
tools=PRODUCT_TOOLS,
|
||||
tool_choice="auto",
|
||||
temperature=0.66,
|
||||
)
|
||||
|
||||
assert response.choices
|
||||
choice = response.choices[0]
|
||||
message = choice.message
|
||||
|
||||
assert choice.finish_reason == "tool_calls"
|
||||
assert message.tool_calls is not None
|
||||
|
||||
tool_call = message.tool_calls[0]
|
||||
assert tool_call.type == "function"
|
||||
assert tool_call.function.name == "get_product_info"
|
||||
|
||||
arguments = json.loads(tool_call.function.arguments)
|
||||
assert "product_id" in arguments
|
||||
assert "inserted" in arguments
|
||||
|
||||
product_id = arguments.get("product_id")
|
||||
inserted = arguments.get("inserted")
|
||||
|
||||
assert isinstance(product_id, int)
|
||||
assert product_id == 7355608
|
||||
assert isinstance(inserted, bool)
|
||||
assert inserted is True
|
||||
|
||||
print("\n[Non-Streaming Product Test Passed]")
|
||||
print(f"Tool Call: {tool_call.function.name}")
|
||||
print(f"Arguments: {arguments}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_product_tool_call():
|
||||
"""Test tool call integer and boolean parameters in streaming mode."""
|
||||
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as server:
|
||||
client = server.get_async_client()
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=LORA_MODEL,
|
||||
messages=PRODUCT_MESSAGES,
|
||||
tools=PRODUCT_TOOLS,
|
||||
tool_choice="auto",
|
||||
temperature=0.66,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
tool_call_chunks = {}
|
||||
async for chunk in stream:
|
||||
if not chunk.choices:
|
||||
continue
|
||||
|
||||
delta = chunk.choices[0].delta
|
||||
if not delta or not delta.tool_calls:
|
||||
continue
|
||||
|
||||
for tool_chunk in delta.tool_calls:
|
||||
index = tool_chunk.index
|
||||
if index not in tool_call_chunks:
|
||||
tool_call_chunks[index] = {"name": "", "arguments": ""}
|
||||
|
||||
if tool_chunk.function.name:
|
||||
tool_call_chunks[index]["name"] += tool_chunk.function.name
|
||||
if tool_chunk.function.arguments:
|
||||
tool_call_chunks[index][
|
||||
"arguments"] += tool_chunk.function.arguments
|
||||
|
||||
assert len(tool_call_chunks) == 1
|
||||
reconstructed_tool_call = tool_call_chunks[0]
|
||||
|
||||
assert reconstructed_tool_call["name"] == "get_product_info"
|
||||
|
||||
arguments = json.loads(reconstructed_tool_call["arguments"])
|
||||
assert "product_id" in arguments
|
||||
assert "inserted" in arguments
|
||||
|
||||
# Handle type coercion for streaming test as well
|
||||
product_id = arguments.get("product_id")
|
||||
inserted = arguments.get("inserted")
|
||||
|
||||
assert isinstance(product_id, int)
|
||||
assert product_id == 7355608
|
||||
assert isinstance(inserted, bool)
|
||||
assert inserted is True
|
||||
|
||||
print("\n[Streaming Product Test Passed]")
|
||||
print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
|
||||
print(f"Reconstructed Arguments: {arguments}")
|
||||
|
||||
@ -216,7 +216,7 @@ def server_with_chunked_processing():
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"512", # Set smaller max_model_len to trigger chunking mechanism
|
||||
'--pooler-config',
|
||||
'--override-pooler-config',
|
||||
('{"pooling_type": "MEAN", "normalize": true, '
|
||||
'"enable_chunked_processing": true, "max_embed_len": 10000}'),
|
||||
"--gpu-memory-utilization",
|
||||
|
||||
@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
|
||||
global WORKER_RUNTIME_SECONDS
|
||||
WORKER_RUNTIME_SECONDS = 0.5
|
||||
|
||||
# Copy the args to avoid mutating them
|
||||
# Copy the args to avoid mutating the
|
||||
args = api_server_args.copy()
|
||||
|
||||
if not with_stats_update:
|
||||
|
||||
@ -19,7 +19,7 @@ pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
|
||||
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000
|
||||
|
||||
# Run evaluation
|
||||
python tests/evals/gsm8k/gsm8k_eval.py --port 8000
|
||||
python tests/gsm8k/gsm8k_eval.py --port 8000
|
||||
```
|
||||
|
||||
## Configuration Format
|
||||
|
||||
@ -18,7 +18,7 @@ if not current_platform.is_rocm():
|
||||
from xformers import ops as xops
|
||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
||||
|
||||
from tests.kernels.utils import make_alibi_bias
|
||||
from vllm.attention.backends.xformers import _make_alibi_bias
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
# This will change depending on the compute capability.
|
||||
@ -429,8 +429,8 @@ def test_multi_query_kv_attention(
|
||||
alibi_bias = None
|
||||
if use_alibi:
|
||||
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
|
||||
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
|
||||
seq_lens)
|
||||
attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype,
|
||||
seq_lens)
|
||||
output = torch.empty_like(query)
|
||||
start = 0
|
||||
# Dynamic sequence length not supported with custom attn_bias.
|
||||
|
||||
@ -67,23 +67,30 @@ def generate_params():
|
||||
return params
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
|
||||
@pytest.mark.parametrize("device, name, use_mla, block_size",
|
||||
generate_params())
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_env(
|
||||
device: str,
|
||||
name: str,
|
||||
use_mla: bool,
|
||||
block_size: int,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with valid device-backend pairs."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, name)
|
||||
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
|
||||
|
||||
if name == "FLASHINFER" and not use_v1:
|
||||
pytest.skip("FlashInfer backend is only available on V1 engine")
|
||||
|
||||
if device == "cpu":
|
||||
if not use_v1:
|
||||
pytest.skip("CPU backend only supports V1")
|
||||
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float16, None, block_size,
|
||||
@ -130,7 +137,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1"
|
||||
expected = f"{name}_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
@ -139,7 +146,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "TRITON_ATTN_VLLM_V1"
|
||||
expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
assert backend.get_name() == expected
|
||||
|
||||
elif device == "cuda":
|
||||
@ -156,7 +163,11 @@ def test_env(
|
||||
# - TRITON_MLA: fallback for other cases
|
||||
|
||||
if name == "CUTLASS_MLA":
|
||||
if block_size != 128:
|
||||
if not use_v1:
|
||||
# CUTLASS_MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"CUTLASS_MLA only supported on V1 engine")
|
||||
elif block_size != 128:
|
||||
# CUTLASS_MLA only supports block_size == 128
|
||||
pytest.skip(
|
||||
"CUTLASS_MLA only supports block_size 128")
|
||||
@ -170,7 +181,11 @@ def test_env(
|
||||
expected = "CUTLASS_MLA_VLLM_V1"
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER_MLA":
|
||||
if block_size not in [32, 64]:
|
||||
if not use_v1:
|
||||
# FlashInfer MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"FlashInfer MLA only supported on V1 engine")
|
||||
elif block_size not in [32, 64]:
|
||||
# FlashInfer MLA only supports block_size 32 or 64
|
||||
pytest.skip(
|
||||
"FlashInfer MLA only supports block_size 32 "
|
||||
@ -202,17 +217,23 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = f"{name}_VLLM_V1"
|
||||
expected = f"{name}_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASH_ATTN_MLA":
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_MLA"
|
||||
assert backend.get_name() == expected
|
||||
if not use_v1:
|
||||
# FlashAttention MLA only supported on V1 engine
|
||||
pytest.skip(
|
||||
"FlashAttention MLA only supported on V1 engine"
|
||||
)
|
||||
else:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_MLA"
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
# TRITON_MLA or other fallback
|
||||
backend = get_attn_backend(16,
|
||||
@ -221,7 +242,8 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "TRITON_MLA_VLLM_V1"
|
||||
expected = ("TRITON_MLA_VLLM_V1"
|
||||
if use_v1 else "TRITON_MLA")
|
||||
assert backend.get_name() == expected
|
||||
elif name == "FLASHINFER":
|
||||
backend = get_attn_backend(16,
|
||||
@ -230,7 +252,7 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASHINFER_VLLM_V1"
|
||||
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
else:
|
||||
backend = get_attn_backend(32,
|
||||
@ -239,30 +261,36 @@ def test_env(
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
expected = "FLASH_ATTN_VLLM_V1"
|
||||
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
|
||||
assert backend.get_name() == expected
|
||||
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
if use_v1:
|
||||
backend = get_attn_backend(16,
|
||||
torch.float16,
|
||||
None,
|
||||
block_size,
|
||||
False,
|
||||
use_mla=use_mla)
|
||||
assert backend.get_name() == "FLEX_ATTENTION", (
|
||||
"Should fallback to FlexAttention if head size is "
|
||||
"not supported by FlashAttention")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_fp32_fallback(
|
||||
device: str,
|
||||
use_v1: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Test attention backend selection with fp32."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
|
||||
if device == "cpu":
|
||||
if not use_v1:
|
||||
pytest.skip("CPU backend only supports V1")
|
||||
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CpuPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, None, 16, False)
|
||||
@ -272,7 +300,8 @@ def test_fp32_fallback(
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
CudaPlatform()):
|
||||
backend = get_attn_backend(16, torch.float32, None, 16, False)
|
||||
assert backend.get_name() == "FLEX_ATTENTION"
|
||||
assert (backend.get_name() == "FLEX_ATTENTION"
|
||||
if use_v1 else "XFORMERS")
|
||||
|
||||
|
||||
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
@ -328,11 +357,12 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||
assert backend.get_name() != STR_FLASH_ATTN_VAL
|
||||
|
||||
|
||||
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test that invalid attention backend names raise ValueError."""
|
||||
with monkeypatch.context() as m, patch(
|
||||
"vllm.attention.selector.current_platform", CudaPlatform()):
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
|
||||
|
||||
# Should raise ValueError for invalid backend
|
||||
|
||||
@ -11,7 +11,7 @@ import torch
|
||||
from xformers import ops as xops
|
||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
|
||||
|
||||
from tests.kernels.utils import make_alibi_bias
|
||||
from vllm.attention.backends.xformers import _make_alibi_bias
|
||||
from vllm.attention.ops.chunked_prefill_paged_decode import (
|
||||
chunked_prefill_paged_decode)
|
||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||
@ -470,7 +470,7 @@ def test_contexted_kv_attention_alibi(
|
||||
key = key.unsqueeze(0)
|
||||
value = value.unsqueeze(0)
|
||||
|
||||
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
|
||||
attn_bias = _make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
|
||||
output_ref = torch.empty_like(output)
|
||||
seq_start = 0
|
||||
query_start = 0
|
||||
@ -479,7 +479,7 @@ def test_contexted_kv_attention_alibi(
|
||||
# FIXME(DefTruth): Because xformers does not support dynamic sequence
|
||||
# lengths with custom attention bias, we process each prompt one by
|
||||
# one. This is inefficient, especially when we have many short prompts.
|
||||
# modified from: vllm/v1/attention/backends/xformers.py#L343
|
||||
# modified from: vllm/attention/backends/xformers.py#L343
|
||||
for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)):
|
||||
seq_end = seq_start + seq_len
|
||||
query_end = query_start + query_len
|
||||
|
||||
@ -16,7 +16,6 @@ def clear_cache():
|
||||
_cached_get_attn_backend.cache_clear()
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
|
||||
def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
|
||||
|
||||
@ -83,7 +83,7 @@ def ref_paged_attn(
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("sliding_window", [None, 64, 128, 256])
|
||||
@pytest.mark.parametrize("sliding_window", [None, 256])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 50.0])
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
|
||||
@ -1,12 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import AutoConfig
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
@ -18,7 +15,6 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
|
||||
head_size: int, max_position_embeddings: int,
|
||||
dtype: torch.dtype, device: torch.device):
|
||||
"""Generate test data for given configuration."""
|
||||
current_platform.seed_everything(42)
|
||||
# Create 2D positions (3, num_tokens) for multimodal case
|
||||
positions = torch.randint(0,
|
||||
max_position_embeddings // 4, (3, num_tokens),
|
||||
@ -37,37 +33,22 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
|
||||
return positions, query, key
|
||||
|
||||
|
||||
class MRoPETestInfo(NamedTuple):
|
||||
model_name: str
|
||||
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
|
||||
atol: float = 1e-2
|
||||
rtol: float = 1.6e-2
|
||||
marks: list[pytest.MarkDecorator] = []
|
||||
def unroll_model_tp_dict(model_tp_dict):
|
||||
return [(model_name, tp_size)
|
||||
for model_name, tp_sizes in model_tp_dict.items()
|
||||
for tp_size in tp_sizes]
|
||||
|
||||
|
||||
TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version
|
||||
model_tp_dict = {
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2-VL-72B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2.5-VL-72B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2],
|
||||
}
|
||||
|
||||
MODELS_TO_TEST = [
|
||||
MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"),
|
||||
MRoPETestInfo(
|
||||
model_name="Qwen/Qwen3-VL-4B-Instruct",
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
|
||||
reason="Qwen3-VL only available after Transformers v4.57",
|
||||
)
|
||||
]),
|
||||
MRoPETestInfo(
|
||||
model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
|
||||
reason="Qwen3-VL only available after Transformers v4.57",
|
||||
)
|
||||
]),
|
||||
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
|
||||
dtype_atol_rtol_list = [
|
||||
[torch.bfloat16, 1e-2, 1.6e-2],
|
||||
]
|
||||
|
||||
num_tokens_list = [11, 8192]
|
||||
@ -75,29 +56,20 @@ num_tokens_list = [11, 8192]
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize("model_info, model_name", [
|
||||
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
|
||||
for test_config in MODELS_TO_TEST
|
||||
])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("model_name, tp_size",
|
||||
unroll_model_tp_dict(model_tp_dict))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("num_tokens", num_tokens_list)
|
||||
def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
|
||||
dtype: torch.dtype, num_tokens: int):
|
||||
|
||||
atol = model_info.atol
|
||||
rtol = model_info.rtol
|
||||
def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
|
||||
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
config = config.get_text_config()
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = (config.head_dim if hasattr(config, "head_dim") else
|
||||
config.hidden_size // total_num_heads)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
is_neox_style = True
|
||||
|
||||
rope_theta = config.rope_theta
|
||||
@ -139,30 +111,24 @@ def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize("model_info, model_name", [
|
||||
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
|
||||
for test_config in MODELS_TO_TEST
|
||||
])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("num_tokens", num_tokens_list)
|
||||
def test_mrope_torch_compile_tracing(model_name: str,
|
||||
model_info: MRoPETestInfo, tp_size: int,
|
||||
dtype: torch.dtype, num_tokens: int):
|
||||
|
||||
atol = model_info.atol
|
||||
rtol = model_info.rtol
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, tp_size",
|
||||
unroll_model_tp_dict({
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2]
|
||||
}))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("num_tokens", [4])
|
||||
def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol,
|
||||
num_tokens):
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
config = config.get_text_config()
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = (config.head_dim if hasattr(config, "head_dim") else
|
||||
config.hidden_size // total_num_heads)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
is_neox_style = True
|
||||
rope_theta = config.rope_theta
|
||||
max_position = config.max_position_embeddings
|
||||
|
||||
@ -11,8 +11,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.config import VllmConfig, current_platform, set_current_vllm_config
|
||||
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
||||
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
||||
|
||||
|
||||
@ -1,24 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
from dataclasses import dataclass
|
||||
from importlib.util import find_spec
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
|
||||
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
|
||||
QuarkLinearMethod, QuarkW4A4MXFP4)
|
||||
from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501
|
||||
QuarkW4A4MXFp4MoEMethod)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
|
||||
QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
|
||||
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
|
||||
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
|
||||
"quark") is not None and version.parse(
|
||||
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
|
||||
|
||||
TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda(
|
||||
) and current_platform.is_device_capability(100)
|
||||
@ -42,12 +39,6 @@ class ModelCase:
|
||||
tp: int
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def enable_pickle(monkeypatch):
|
||||
"""`LLM.apply_model` requires pickling a function."""
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
|
||||
@pytest.mark.parametrize('model_case', [
|
||||
ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
|
||||
ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
|
||||
@ -64,19 +55,21 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
|
||||
tensor_parallel_size=model_case.tp,
|
||||
load_format="dummy") as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
# TODO: llm.apply_model(check_model) currently relies on V0 internals.
|
||||
# Re-enable this later.
|
||||
# def check_model(model):
|
||||
# layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
# qkv_proj = layer.self_attn.qkv_proj
|
||||
|
||||
assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
|
||||
assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
|
||||
# assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
|
||||
# assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
|
||||
|
||||
assert isinstance(layer.mlp.experts.quant_method,
|
||||
QuarkW4A4MXFp4MoEMethod)
|
||||
# assert isinstance(layer.mlp.experts.quant_method,
|
||||
# QuarkW4A4MXFp4MoEMethod)
|
||||
|
||||
if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
|
||||
llm.apply_model(check_model)
|
||||
# if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
|
||||
# llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Today I am in the French Alps and",
|
||||
max_tokens=20)
|
||||
|
||||
@ -513,6 +513,10 @@ def make_backend(backend_name: str) -> AttentionBackend:
|
||||
Construct the backend instance determined by the backend_name string
|
||||
argument.
|
||||
|
||||
"XFORMERS" -> construct xformers backend
|
||||
|
||||
TODO: other backends
|
||||
|
||||
Note: at time of writing the Attention wrapper automatically selects
|
||||
its own backend for Attention.forward(); so the backend instance which
|
||||
you generate with this function is not meant to be used for *running*
|
||||
@ -524,68 +528,18 @@ def make_backend(backend_name: str) -> AttentionBackend:
|
||||
|
||||
* Backend instance
|
||||
'''
|
||||
if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
|
||||
from vllm.v1.attention.backends.xformers import (
|
||||
XFormersAttentionBackend)
|
||||
return XFormersAttentionBackend()
|
||||
if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
if backend_name == STR_XFORMERS_ATTN_VAL:
|
||||
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
|
||||
from vllm.attention.backends.xformers import XFormersBackend
|
||||
return XFormersBackend()
|
||||
elif backend_name == STR_FLASH_ATTN_VAL:
|
||||
from vllm.attention.backends.flash_attn import FlashAttentionBackend
|
||||
return FlashAttentionBackend()
|
||||
if backend_name == "TRITON_ATTN_VLLM_V1":
|
||||
from vllm.v1.attention.backends.triton_attn import (
|
||||
TritonAttentionBackend)
|
||||
return TritonAttentionBackend()
|
||||
if backend_name == "FLEX_ATTENTION":
|
||||
from vllm.v1.attention.backends.flex_attention import (
|
||||
FlexAttentionBackend)
|
||||
return FlexAttentionBackend()
|
||||
if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
|
||||
from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
|
||||
return TorchSDPABackend()
|
||||
if backend_name == "FLASHINFER":
|
||||
from vllm.v1.attention.backends.flashinfer import FlashInferBackend
|
||||
return FlashInferBackend()
|
||||
|
||||
raise AssertionError(
|
||||
f"Unrecognized backend_name {backend_name} for unit test")
|
||||
|
||||
|
||||
def make_alibi_bias(
|
||||
alibi_slopes: torch.Tensor,
|
||||
num_kv_heads: int,
|
||||
dtype: torch.dtype,
|
||||
seq_lens: list[int],
|
||||
) -> list[Any]:
|
||||
"""Create ALiBi biases compatible with xFormers attention tests."""
|
||||
from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias
|
||||
|
||||
if alibi_slopes is None:
|
||||
return [None for _ in seq_lens]
|
||||
|
||||
attn_biases: list[Any] = []
|
||||
num_heads = alibi_slopes.shape[0]
|
||||
assert num_heads >= num_kv_heads, (
|
||||
"ALiBi slopes expect at least as many heads as KV heads")
|
||||
|
||||
for seq_len in seq_lens:
|
||||
bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
|
||||
bias = bias[None, :] - bias[:, None]
|
||||
|
||||
padded_len = (seq_len + 7) // 8 * 8
|
||||
bias_tensor = torch.empty(
|
||||
1,
|
||||
num_heads,
|
||||
seq_len,
|
||||
padded_len,
|
||||
device=alibi_slopes.device,
|
||||
dtype=dtype,
|
||||
)[:, :, :, :seq_len].copy_(bias)
|
||||
bias_tensor.mul_(alibi_slopes[:, None, None])
|
||||
attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor))
|
||||
|
||||
return attn_biases
|
||||
|
||||
|
||||
def _make_metadata_tensors(
|
||||
seq_lens: Optional[list[int]],
|
||||
context_lens: Optional[list[int]],
|
||||
@ -959,6 +913,7 @@ def make_test_metadata(
|
||||
return attn_backend_obj.make_metadata(
|
||||
num_prefills=num_prefills,
|
||||
slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=True,
|
||||
num_prefill_tokens=num_prefill_tokens,
|
||||
num_decode_tokens=num_decode_tokens,
|
||||
@ -1008,6 +963,7 @@ def make_test_metadata(
|
||||
return attn_backend_obj.make_metadata(
|
||||
num_prefills=num_prefills,
|
||||
slot_mapping=kv_mmap.slot_mapping,
|
||||
multi_modal_placeholder_index_maps=None,
|
||||
enable_kv_scales_calculation=True,
|
||||
num_prefill_tokens=num_prefill_tokens,
|
||||
num_decode_tokens=num_decode_tokens,
|
||||
|
||||
@ -17,6 +17,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.models.interfaces import SupportsLoRA
|
||||
from vllm.platforms import current_platform
|
||||
@ -96,6 +97,7 @@ def dummy_model() -> nn.Module:
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model.config = MagicMock()
|
||||
model.embedding_modules = {"lm_head": "lm_head"}
|
||||
@ -123,6 +125,7 @@ def dummy_model_gate_up() -> nn.Module:
|
||||
# Special handling for lm_head & sampler
|
||||
("lm_head", ParallelLMHead(512, 10)),
|
||||
("logits_processor", LogitsProcessor(512)),
|
||||
("sampler", Sampler())
|
||||
]))
|
||||
model.config = MagicMock()
|
||||
model.packed_modules_mapping = {
|
||||
|
||||
@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions.
|
||||
import pytest
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
build_async_engine_client_from_engine_args)
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
|
||||
|
||||
@ -8,12 +8,11 @@ import torch
|
||||
from safetensors.torch import load_file
|
||||
from torch import nn
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA)
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager)
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
@ -436,19 +435,10 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
|
||||
target_modules=["layer1.dense1", "dense2"],
|
||||
lora_dtype=DEFAULT_DTYPE,
|
||||
)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
worker_adapter_manager = LRUCacheWorkerLoRAManager(
|
||||
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
|
||||
worker_adapter_manager.max_num_seqs = 4
|
||||
worker_adapter_manager.max_num_batched_tokens = 2
|
||||
|
||||
4, 2,
|
||||
dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size,
|
||||
lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
worker_adapter_manager.create_lora_manager(dummy_model)
|
||||
|
||||
mapping = LoRAMapping([], [])
|
||||
@ -527,20 +517,10 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
|
||||
max_cpu_loras=4,
|
||||
max_loras=4,
|
||||
lora_dtype=DEFAULT_DTYPE)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
|
||||
worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
|
||||
EMBEDDING_MODULES,
|
||||
EMBEDDING_PADDING_MODULES)
|
||||
worker_adapter_manager.vocab_size = (
|
||||
dummy_model_gate_up.unpadded_vocab_size -
|
||||
lora_config.lora_extra_vocab_size)
|
||||
worker_adapter_manager = WorkerLoRAManager(
|
||||
4, 2, dummy_model_gate_up.unpadded_vocab_size -
|
||||
lora_config.lora_extra_vocab_size, lora_config, device,
|
||||
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
|
||||
|
||||
dummy_lora_files = f"{tmp_path}/lora_adapter"
|
||||
|
||||
@ -9,7 +9,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
|
||||
|
||||
class DummyLoRAManager:
|
||||
|
||||
@ -15,8 +15,7 @@ from ...utils import check_logprobs_close
|
||||
# have a clean way to fall back, so we fail with
|
||||
# a clear msg when it happens.
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
# NOTE(woosuk): Skipping these tests until V1 supports them.
|
||||
# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
|
||||
# This list contains the model that are using AITER kernel.
|
||||
# Skip model that are not using AITER tests.
|
||||
@ -114,6 +113,9 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
if model in REQUIRES_V0:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
if use_rocm_aiter and (model in AITER_MODEL_LIST):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
|
||||
@ -123,6 +125,12 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
# in parts of the operators
|
||||
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
|
||||
|
||||
# Note: can be removed when
|
||||
# https://github.com/vllm-project/vllm/pull/24278 finished
|
||||
if current_platform.is_cpu() and use_prompt_embeds:
|
||||
pytest.skip("Skipping use_prompt_embeds=True with "
|
||||
"V1-only CPU backend.")
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close, check_outputs_equal
|
||||
|
||||
# Mark all tests as hybrid
|
||||
pytestmark = pytest.mark.hybrid_model
|
||||
@ -88,6 +88,15 @@ def test_models(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
if model not in V0_UNSUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
vllm_v0_outputs = None
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
@ -95,6 +104,14 @@ def test_models(
|
||||
else:
|
||||
vllm_v1_outputs = None
|
||||
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
@ -140,6 +157,45 @@ def test_batching(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
|
||||
def test_chunked_prefill(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
chunked_prefill_token_size: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
max_num_seqs = chunked_prefill_token_size
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
chunked = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model,
|
||||
enable_chunked_prefill=False,
|
||||
max_num_seqs=max_num_seqs) as vllm_model:
|
||||
non_chunked = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=chunked,
|
||||
outputs_1_lst=non_chunked,
|
||||
name_0="chunked",
|
||||
name_1="non_chunked",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [10])
|
||||
def test_chunked_prefill_with_parallel_sampling(
|
||||
@ -201,6 +257,38 @@ def test_mamba_cache_cg_padding(
|
||||
"Could be related to mamba cache not padded correctly")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [20])
|
||||
def test_models_preemption_recompute(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
"""
|
||||
Tests that outputs are identical with and w/o preemptions (recompute).
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
scheduler = vllm_model.llm.llm_engine.scheduler[0]
|
||||
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
|
||||
preempt_vllm_outputs = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=preempt_vllm_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="vllm_preepmtions",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
|
||||
vllm_runner,
|
||||
@ -298,10 +386,27 @@ def test_full_cuda_graph(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
if model not in V0_UNSUPPORTED_MODELS:
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
vllm_v0_outputs = None
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
@ -337,12 +442,27 @@ def test_fp32_cache_state(
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
|
||||
@ -58,7 +58,7 @@ def test_models(
|
||||
|
||||
vllm_extra_kwargs = {}
|
||||
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
|
||||
vllm_extra_kwargs["pooler_config"] = \
|
||||
vllm_extra_kwargs["override_pooler_config"] = \
|
||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||
|
||||
max_model_len: Optional[int] = 512
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@ -100,7 +99,7 @@ def test_gemma_multimodal(
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
override_pooler_config={"pooling_type": "LAST"},
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
|
||||
@ -24,18 +24,18 @@ def test_classify_models_using_activation(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=False)) as vllm_model:
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
activation=False)) as vllm_model:
|
||||
wo_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True)) as vllm_model:
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
activation=True)) as vllm_model:
|
||||
w_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
for wo_activation, w_activation in zip(wo_activation_out,
|
||||
@ -43,8 +43,9 @@ def test_classify_models_using_activation(
|
||||
wo_activation = torch.tensor(wo_activation)
|
||||
w_activation = torch.tensor(w_activation)
|
||||
|
||||
assert not torch.allclose(wo_activation, w_activation,
|
||||
atol=1e-2), "pooler_config is not working"
|
||||
assert not torch.allclose(
|
||||
wo_activation, w_activation,
|
||||
atol=1e-2), "override_pooler_config is not working"
|
||||
assert torch.allclose(softmax(wo_activation), w_activation,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
|
||||
@ -64,22 +65,23 @@ def test_embed_models_using_normalize(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
normalize=False)) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
override_pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
assert not torch.allclose(
|
||||
wo_normalize, w_normalize,
|
||||
atol=1e-2), "pooler_config normalize is not working"
|
||||
atol=1e-2), "override_pooler_config normalize is not working"
|
||||
assert torch.allclose(
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
|
||||
atol=1e-2), "w_normal should be close to normal(wo_normal)."
|
||||
@ -100,16 +102,18 @@ def test_reward_models_using_softmax(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
wo_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
w_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_softmax, w_softmax):
|
||||
@ -117,7 +121,7 @@ def test_reward_models_using_softmax(
|
||||
w = torch.tensor(w)
|
||||
|
||||
assert not torch.allclose(
|
||||
wo, w, atol=1e-2), "pooler_config softmax is not working"
|
||||
wo, w, atol=1e-2), "override_pooler_config softmax is not working"
|
||||
assert torch.allclose(
|
||||
softmax(wo), w,
|
||||
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -81,7 +82,7 @@ def test_prm_models(
|
||||
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
|
||||
max_transformers_version="4.53.2")
|
||||
|
||||
if current_platform.is_cpu():
|
||||
if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
|
||||
pytest.skip("CPU only supports V1")
|
||||
|
||||
if current_platform.is_rocm():
|
||||
|
||||
@ -32,6 +32,13 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
|
||||
if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
REQUIRES_V0_MODELS = [
|
||||
# V1 Test: not enough KV cache space in C1.
|
||||
"fuyu",
|
||||
# V1 Test: Deadlock issue when processing mm_inputs
|
||||
"llava-onevision-transformers",
|
||||
]
|
||||
|
||||
# yapf: disable
|
||||
COMMON_BROADCAST_SETTINGS = {
|
||||
"test_type": VLMTestType.IMAGE,
|
||||
@ -179,11 +186,8 @@ VLM_TEST_SETTINGS = {
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
"default_torch_num_threads": 1,
|
||||
},
|
||||
# FIXME: Investigate why the test hangs
|
||||
# when processing the 3rd prompt in vLLM
|
||||
marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
"idefics3-transformers": VLMTestInfo(
|
||||
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||
@ -316,7 +320,6 @@ VLM_TEST_SETTINGS = {
|
||||
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
||||
num_logprobs=10,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"gemma3": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
@ -858,14 +861,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
|
||||
test_type=VLMTestType.IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_single_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_single_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -884,14 +886,13 @@ def test_single_image_models(
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_multi_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_multi_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -910,13 +911,13 @@ def test_multi_image_models(
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_image_embedding_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
def test_image_embedding_models(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_embedding_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -934,13 +935,11 @@ def test_image_embedding_models(
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_video_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets,
|
||||
):
|
||||
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_video_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -958,13 +957,11 @@ def test_video_models(
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_audio_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_audio_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -987,7 +984,10 @@ def test_custom_inputs_models(
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch,
|
||||
):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_custom_inputs_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1006,14 +1006,13 @@ def test_custom_inputs_models(
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@create_new_process_for_each_test()
|
||||
def test_single_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_single_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -1033,14 +1032,13 @@ def test_single_image_models_heavy(
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@create_new_process_for_each_test()
|
||||
def test_multi_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_multi_image_test(
|
||||
tmp_path=tmp_path,
|
||||
@ -1060,13 +1058,14 @@ def test_multi_image_models_heavy(
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@create_new_process_for_each_test()
|
||||
def test_image_embedding_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
def test_image_embedding_models_heavy(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_embedding_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1084,13 +1083,12 @@ def test_image_embedding_models_heavy(
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
def test_video_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets,
|
||||
):
|
||||
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: VideoTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_video_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1108,13 +1106,12 @@ def test_video_models_heavy(
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
def test_audio_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets, monkeypatch):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_audio_test(
|
||||
model_test_info=model_test_info,
|
||||
@ -1138,7 +1135,10 @@ def test_custom_inputs_models_heavy(
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch,
|
||||
):
|
||||
if model_type in REQUIRES_V0_MODELS:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
model_test_info = VLM_TEST_SETTINGS[model_type]
|
||||
runners.run_custom_inputs_test(
|
||||
model_test_info=model_test_info,
|
||||
|
||||
@ -7,8 +7,8 @@ from typing import Optional
|
||||
import pytest
|
||||
from transformers import AutoModelForSpeechSeq2Seq
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
|
||||
VllmRunner)
|
||||
|
||||
@ -12,10 +12,10 @@ from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import convert_image_mode, rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
|
||||
@ -12,12 +12,13 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.logprobs import Logprob, SampleLogprobs
|
||||
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.multimodal.inputs import PlaceholderRange
|
||||
from vllm.sequence import Logprob, SampleLogprobs
|
||||
|
||||
from ....utils import VLLM_PATH, large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close, dummy_hf_overrides
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
@ -184,3 +185,47 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls,expected_ranges",
|
||||
[(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
|
||||
(IMG_URLS[1:4], [
|
||||
PlaceholderRange(offset=11, length=266),
|
||||
PlaceholderRange(offset=277, length=1056),
|
||||
PlaceholderRange(offset=1333, length=418)
|
||||
])])
|
||||
def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
|
||||
expected_ranges: list[PlaceholderRange],
|
||||
local_asset_server, monkeypatch) -> None:
|
||||
local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
|
||||
prompt = _create_engine_inputs_hf(local_image_urls)
|
||||
|
||||
# This placeholder checking test only works with V0 engine
|
||||
# where `multi_modal_placeholders` is returned with `RequestOutput`
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(
|
||||
"mistral-community/pixtral-12b",
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
load_format="dummy",
|
||||
hf_overrides=dummy_hf_overrides,
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.llm.generate(prompt)
|
||||
|
||||
assert len(outputs) == 1, f"{len(outputs)=}"
|
||||
output: RequestOutput = outputs[0]
|
||||
assert hasattr(output,
|
||||
"multi_modal_placeholders"), f"{output.__dict__=}"
|
||||
assert "image" in output.multi_modal_placeholders, \
|
||||
f"{output.multi_modal_placeholders.keys()=}"
|
||||
image_placeholder_ranges: list[
|
||||
PlaceholderRange] = output.multi_modal_placeholders["image"]
|
||||
assert len(image_placeholder_ranges) == len(
|
||||
expected_ranges), f"{image_placeholder_ranges=}"
|
||||
for real_range, expected_range in zip(image_placeholder_ranges,
|
||||
expected_ranges):
|
||||
assert real_range.offset == expected_range.offset, \
|
||||
f"{real_range=} {expected_range=}"
|
||||
assert real_range.length == expected_range.length, \
|
||||
f"{real_range=} {expected_range=}"
|
||||
|
||||
@ -17,9 +17,11 @@ from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def enable_pickle(monkeypatch):
|
||||
"""`LLM.apply_model` requires pickling a function."""
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
V1 Test: batch_make_xxxxx_embeddings calls a V0 internal
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
models = ["Qwen/Qwen2-VL-2B-Instruct"]
|
||||
@ -124,8 +126,9 @@ def batch_make_image_embeddings(
|
||||
image_grid_thw_on_device = image_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
return visual(pixel_values_on_device,
|
||||
grid_thw=image_grid_thw_on_device).cpu()
|
||||
grid_thw=image_grid_thw_on_device)
|
||||
|
||||
# V1 Test: this calls a V0 internal.
|
||||
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
# split into original batches
|
||||
@ -207,8 +210,9 @@ def batch_make_video_embeddings(
|
||||
video_grid_thw_on_device = video_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
return visual(pixel_values_on_device,
|
||||
grid_thw=video_grid_thw_on_device).cpu()
|
||||
grid_thw=video_grid_thw_on_device)
|
||||
|
||||
# V1 Test: this calls a V0 internal.
|
||||
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
# split into original batches
|
||||
@ -262,20 +266,19 @@ def run_embedding_input_test(
|
||||
processor = AutoProcessor.from_pretrained(model)
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={
|
||||
"image": mm_limit,
|
||||
"video": mm_limit
|
||||
},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={
|
||||
"image": mm_limit,
|
||||
"video": mm_limit
|
||||
},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend
|
||||
) as vllm_model:
|
||||
|
||||
outputs_per_case_for_original_input = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
@ -326,8 +329,9 @@ def run_embedding_input_test(
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
||||
size_factors, dtype, max_tokens,
|
||||
num_logprobs, monkeypatch) -> None:
|
||||
size_factors, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: list[tuple[
|
||||
|
||||
@ -19,7 +19,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig, GenerationMixin)
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, ImageTestAssets
|
||||
|
||||
@ -12,7 +12,7 @@ from transformers import AutoModelForCausalLM
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import RunnerOption
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
|
||||
|
||||
@ -4,6 +4,8 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
|
||||
|
||||
@ -28,17 +30,19 @@ def _run_test(
|
||||
} for _ in range(10)
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype="half",
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
with (
|
||||
set_default_torch_num_threads(1),
|
||||
vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype=torch.float16,
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
) as vllm_model,
|
||||
):
|
||||
vllm_model.encode(prompt)
|
||||
|
||||
|
||||
|
||||
@ -45,15 +45,12 @@ def run_awq_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
with vllm_runner(source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
source_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
@ -62,16 +59,13 @@ def run_awq_test(
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
with vllm_runner(quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
quant_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
@ -114,8 +108,12 @@ def run_awq_test(
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@torch.inference_mode()
|
||||
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
|
||||
size_factors, dtype, max_tokens, num_logprobs) -> None:
|
||||
size_factors, dtype, max_tokens, num_logprobs,
|
||||
monkeypatch) -> None:
|
||||
|
||||
# Test V1: this test hangs during setup on single-scale input.
|
||||
# TODO: fixure out why and re-enable this on V1.
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
run_awq_test(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
|
||||
@ -5,7 +5,10 @@
|
||||
Run `pytest tests/quantization/test_bitsandbytes.py`.
|
||||
'''
|
||||
|
||||
import gc
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import BitsAndBytesConfig
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
@ -128,15 +131,12 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
|
||||
))
|
||||
with vllm_runner(model_name,
|
||||
quantization='bitsandbytes',
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1) as llm:
|
||||
enforce_eager=False) as llm:
|
||||
vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
|
||||
max_tokens=32,
|
||||
num_logprobs=5)
|
||||
|
||||
with hf_runner(model_name,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
default_torch_num_threads=1) as llm:
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
transformers_outputs = llm.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens=32, num_logprobs=5)
|
||||
check_logprobs_close(
|
||||
@ -174,8 +174,7 @@ def test_4bit_bnb_embedding_model(
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes",
|
||||
default_torch_num_threads=1) as vllm_model:
|
||||
quantization="bitsandbytes") as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
|
||||
@ -185,7 +184,6 @@ def test_4bit_bnb_embedding_model(
|
||||
dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
is_sentence_transformer=True,
|
||||
default_torch_num_threads=1,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
@ -224,22 +222,26 @@ def validate_generated_texts(hf_runner,
|
||||
with vllm_runner(model_name,
|
||||
quantization=None if pre_quant else 'bitsandbytes',
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1) as llm:
|
||||
enforce_eager=False) as llm:
|
||||
|
||||
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
if hf_model_kwargs is None:
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(model_name,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
default_torch_num_threads=1) as llm:
|
||||
with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
# Compare the generated strings
|
||||
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
|
||||
hf_str = hf_log["generated_text"]
|
||||
|
||||
@ -32,10 +32,13 @@ from ..utils import check_logprobs_close
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("enforce_eager", [True])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
|
||||
# NOTE: Increasing this in this suite will fail CI because we currently cannot
|
||||
# reset distributed env properly. Use a value > 1 just when you test.
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
# Due to low-precision numerical divergence, this test is too sensitive for
|
||||
# the async postprocessor
|
||||
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -46,6 +49,7 @@ def test_models(
|
||||
enforce_eager: bool,
|
||||
backend: str,
|
||||
tensor_parallel_size: int,
|
||||
disable_async_output_proc: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
@ -57,9 +61,6 @@ def test_models(
|
||||
pytest.skip(
|
||||
f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
|
||||
|
||||
if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
|
||||
pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("TOKENIZERS_PARALLELISM", 'true')
|
||||
m.setenv(STR_BACKEND_ENV_VAR, backend)
|
||||
@ -73,6 +74,7 @@ def test_models(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype="auto",
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -83,6 +85,7 @@ def test_models(
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -107,6 +110,9 @@ def test_models(
|
||||
])
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
# Due to low-precision numerical divergence, this test is too sensitive for
|
||||
# the async postprocessor
|
||||
@pytest.mark.parametrize("disable_async_output_proc", [True])
|
||||
def test_cpu_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -114,6 +120,7 @@ def test_cpu_models(
|
||||
base_model: str,
|
||||
test_model: str,
|
||||
max_tokens: int,
|
||||
disable_async_output_proc: bool,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
@ -131,6 +138,7 @@ def test_cpu_models(
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype="auto",
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
@ -140,6 +148,7 @@ def test_cpu_models(
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS)
|
||||
|
||||
@ -448,8 +448,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
max_transformers_version="4.48", # noqa: E501
|
||||
transformers_version_reason="HF model is not compatible.", # noqa: E501
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
|
||||
"DotsOCRForCausalLM": _HfExamplesInfo("rednote-hilab/dots.ocr",
|
||||
trust_remote_code=True),
|
||||
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
|
||||
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT", # noqa: E501
|
||||
trust_remote_code=True),
|
||||
|
||||
@ -7,6 +7,8 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import ModelImpl
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
from vllm.v1.engine.core import EngineCore as V1EngineCore
|
||||
@ -60,6 +62,10 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
False))
|
||||
|
||||
# Avoid calling model.forward()
|
||||
def _initialize_kv_caches_v0(self) -> None:
|
||||
self.cache_config.num_gpu_blocks = 0
|
||||
self.cache_config.num_cpu_blocks = 0
|
||||
|
||||
def _initialize_kv_caches_v1(self, vllm_config):
|
||||
kv_cache_specs = self.model_executor.get_kv_cache_specs()
|
||||
scheduler_kv_cache_config = get_kv_cache_configs(
|
||||
@ -71,15 +77,16 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
|
||||
return 1, 0, scheduler_kv_cache_config
|
||||
|
||||
with (patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v0),
|
||||
patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v1), monkeypatch.context() as m):
|
||||
if model_info.v0_only:
|
||||
# NOTE(woosuk): skip the test for V0-only models
|
||||
return
|
||||
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
|
||||
pytest.skip(
|
||||
"Differential Flash Attention backend has been removed.")
|
||||
# Phi4FlashForCausalLM and MotifForCausalLM
|
||||
# only supports DIFFERENTIAL_FLASH_ATTN backend
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
|
||||
if model_arch == "GptOssForCausalLM":
|
||||
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
|
||||
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
|
||||
@ -104,8 +111,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# these tests seem to produce leftover memory
|
||||
gpu_memory_utilization=0.80,
|
||||
load_format="dummy",
|
||||
model_impl="transformers"
|
||||
if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
|
||||
model_impl=ModelImpl.TRANSFORMERS
|
||||
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
|
||||
hf_overrides=hf_overrides_fn,
|
||||
max_num_seqs=model_info.max_num_seqs)
|
||||
|
||||
|
||||
@ -42,7 +42,6 @@ def test_oot_registration_text_generation(
|
||||
assert rest == ""
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_embedding(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@ -63,7 +62,6 @@ def test_oot_registration_embedding(
|
||||
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_multimodal(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
|
||||
@ -5,6 +5,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.conftest import VllmRunner
|
||||
from vllm.utils import set_default_torch_num_threads
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -24,17 +25,19 @@ def test_inference(
|
||||
prompt = dict(prompt_token_ids=[1],
|
||||
multi_modal_data=dict(pixel_values=pixel_values,
|
||||
location_coords=location_coords))
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype="half",
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
with (
|
||||
set_default_torch_num_threads(1),
|
||||
vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype=torch.float16,
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
) as vllm_model,
|
||||
):
|
||||
|
||||
vllm_output = vllm_model.llm.encode(prompt)
|
||||
assert torch.equal(
|
||||
|
||||
@ -9,7 +9,7 @@ from vllm.platforms import current_platform
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
from ..utils import multi_gpu_test, prep_prompts
|
||||
from .utils import check_embeddings_close, check_logprobs_close
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
|
||||
def check_implementation(
|
||||
@ -165,40 +165,6 @@ def test_embed_loading(vllm_runner, model):
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
# Encoder model
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
])
|
||||
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
|
||||
import transformers
|
||||
from packaging.version import Version
|
||||
installed = Version(transformers.__version__)
|
||||
required = Version("4.57.0.dev0")
|
||||
if installed < required:
|
||||
pytest.skip("Encoder models with the Transformers backend require "
|
||||
f"transformers>={required}, but got {installed}")
|
||||
|
||||
with vllm_runner(model, max_model_len=512,
|
||||
model_impl="transformers") as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(model, is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
|
||||
@ -12,7 +12,7 @@ from transformers import PretrainedConfig
|
||||
|
||||
from vllm.config import ModelConfig, ModelDType, RunnerOption
|
||||
from vllm.inputs import InputContext
|
||||
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
@ -1,140 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# test_audio.py
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.audio import (AudioMediaIO, AudioResampler,
|
||||
resample_audio_librosa,
|
||||
resample_audio_scipy)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_audio():
|
||||
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
|
||||
|
||||
|
||||
def test_resample_audio_librosa(dummy_audio):
|
||||
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
|
||||
mock_resample.return_value = dummy_audio * 2
|
||||
out = resample_audio_librosa(dummy_audio,
|
||||
orig_sr=44100,
|
||||
target_sr=22050)
|
||||
mock_resample.assert_called_once_with(dummy_audio,
|
||||
orig_sr=44100,
|
||||
target_sr=22050)
|
||||
assert np.all(out == dummy_audio * 2)
|
||||
|
||||
|
||||
def test_resample_audio_scipy(dummy_audio):
|
||||
out_down = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=2)
|
||||
out_up = resample_audio_scipy(dummy_audio, orig_sr=2, target_sr=4)
|
||||
out_same = resample_audio_scipy(dummy_audio, orig_sr=4, target_sr=4)
|
||||
|
||||
assert len(out_down) == 3
|
||||
assert len(out_up) == 10
|
||||
assert np.all(out_same == dummy_audio)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason="resample_audio_scipy is buggy for non-integer ratios")
|
||||
def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
|
||||
out = resample_audio_scipy(dummy_audio, orig_sr=5, target_sr=3)
|
||||
|
||||
expected_len = int(round(len(dummy_audio) * 3 / 5))
|
||||
assert len(out) == expected_len
|
||||
|
||||
assert isinstance(out, np.ndarray)
|
||||
assert np.isfinite(out).all()
|
||||
|
||||
|
||||
def test_audio_resampler_librosa_calls_resample(dummy_audio):
|
||||
resampler = AudioResampler(target_sr=22050, method="librosa")
|
||||
with patch(
|
||||
"vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
|
||||
mock_resample.return_value = dummy_audio
|
||||
out = resampler.resample(dummy_audio, orig_sr=44100)
|
||||
mock_resample.assert_called_once_with(dummy_audio,
|
||||
orig_sr=44100,
|
||||
target_sr=22050)
|
||||
assert np.all(out == dummy_audio)
|
||||
|
||||
|
||||
def test_audio_resampler_scipy_calls_resample(dummy_audio):
|
||||
resampler = AudioResampler(target_sr=22050, method="scipy")
|
||||
with patch("vllm.multimodal.audio.resample_audio_scipy") as mock_resample:
|
||||
mock_resample.return_value = dummy_audio
|
||||
out = resampler.resample(dummy_audio, orig_sr=44100)
|
||||
mock_resample.assert_called_once_with(dummy_audio,
|
||||
orig_sr=44100,
|
||||
target_sr=22050)
|
||||
assert np.all(out == dummy_audio)
|
||||
|
||||
|
||||
def test_audio_resampler_invalid_method(dummy_audio):
|
||||
resampler = AudioResampler(target_sr=22050, method="invalid")
|
||||
with pytest.raises(ValueError):
|
||||
resampler.resample(dummy_audio, orig_sr=44100)
|
||||
|
||||
|
||||
def test_audio_resampler_no_target_sr(dummy_audio):
|
||||
resampler = AudioResampler(target_sr=None)
|
||||
with pytest.raises(RuntimeError):
|
||||
resampler.resample(dummy_audio, orig_sr=44100)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_audio_bytes():
|
||||
return b"FAKEAUDIOBYTES"
|
||||
|
||||
|
||||
def test_audio_media_io_load_bytes(dummy_audio_bytes):
|
||||
audio_io = AudioMediaIO()
|
||||
with patch("vllm.multimodal.audio.librosa.load") as mock_load:
|
||||
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
|
||||
out = audio_io.load_bytes(dummy_audio_bytes)
|
||||
mock_load.assert_called_once()
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_base64(dummy_audio_bytes):
|
||||
audio_io = AudioMediaIO()
|
||||
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
|
||||
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
|
||||
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
|
||||
out = audio_io.load_base64("audio/wav", encoded)
|
||||
mock_load_bytes.assert_called_once()
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_file():
|
||||
audio_io = AudioMediaIO()
|
||||
path = Path("/fake/path.wav")
|
||||
with patch("vllm.multimodal.audio.librosa.load") as mock_load:
|
||||
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
|
||||
out = audio_io.load_file(path)
|
||||
mock_load.assert_called_once_with(path, sr=None)
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_encode_base64(dummy_audio):
|
||||
audio_io = AudioMediaIO()
|
||||
media = (dummy_audio, 16000)
|
||||
with patch("vllm.multimodal.audio.soundfile.write") as mock_write:
|
||||
|
||||
def write_to_buffer(buffer, *_args, **_kwargs):
|
||||
buffer.write(b"dummy_wav_data")
|
||||
|
||||
mock_write.side_effect = write_to_buffer
|
||||
|
||||
out = audio_io.encode_base64(media)
|
||||
decoded = base64.b64decode(out)
|
||||
assert decoded == b"dummy_wav_data"
|
||||
mock_write.assert_called_once()
|
||||
@ -9,6 +9,7 @@ from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
|
||||
LlavaForConditionalGeneration,
|
||||
LlavaMultiModalProcessor,
|
||||
LlavaProcessingInfo)
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
|
||||
@ -17,10 +18,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
dummy_inputs=LlavaDummyInputsBuilder)
|
||||
class MyLlava(LlavaForConditionalGeneration):
|
||||
|
||||
def compute_logits(self,
|
||||
hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
|
||||
def compute_logits(
|
||||
self, hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
|
||||
# this dummy model always predicts the first token
|
||||
logits = super().compute_logits(hidden_states)
|
||||
logits = super().compute_logits(hidden_states, sampling_metadata)
|
||||
if logits is not None:
|
||||
logits.zero_()
|
||||
logits[:, 0] += 1.0
|
||||
|
||||
@ -6,14 +6,16 @@ from typing import Optional
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.models.opt import OPTForCausalLM
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
|
||||
|
||||
class MyOPTForCausalLM(OPTForCausalLM):
|
||||
|
||||
def compute_logits(self,
|
||||
hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
|
||||
def compute_logits(
|
||||
self, hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
|
||||
# this dummy model always predicts the first token
|
||||
logits = super().compute_logits(hidden_states)
|
||||
logits = super().compute_logits(hidden_states, sampling_metadata)
|
||||
if logits is not None:
|
||||
logits.zero_()
|
||||
logits[:, 0] += 1.0
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user