Compare commits
84 Commits
codex/add-
...
khluu/clea
| Author | SHA1 | Date | |
|---|---|---|---|
| 6e71a2a6c8 | |||
| d328f7894f | |||
| 98aee612aa | |||
| 598bd74cf8 | |||
| 2417798471 | |||
| 9480ae24e3 | |||
| f399182e8c | |||
| 1c41310584 | |||
| c83c4ff815 | |||
| 0e1759cd54 | |||
| e66ed3e675 | |||
| e0653f6c0b | |||
| 38ba061f6f | |||
| 0a74e9d0f2 | |||
| 8bd5844989 | |||
| ce30dca5c4 | |||
| 2f0bab3f26 | |||
| fad73be1a5 | |||
| 56d04089ef | |||
| 7be0cb8e9e | |||
| 1fa1d6a9a0 | |||
| d59c986444 | |||
| 04d0c60770 | |||
| 2b41cbbf03 | |||
| 0235103cbb | |||
| a344a5aa0a | |||
| 5685370271 | |||
| a0e0efd6bd | |||
| cf91a89dd2 | |||
| 39a22dcaac | |||
| 41c80698b3 | |||
| 7c8271cd1e | |||
| 3e330fcb21 | |||
| d46934b229 | |||
| 107284959a | |||
| dc1a53186d | |||
| 55602bb2e6 | |||
| d7fbc6ddac | |||
| 5438967fbc | |||
| 422e793fa6 | |||
| 1cb39dbcdd | |||
| 437c3ce026 | |||
| 499b074bfd | |||
| ff0e59d83a | |||
| b55713683c | |||
| acc1a6e10a | |||
| 8c742a66d1 | |||
| 183a70967a | |||
| 14b4326b94 | |||
| 752d2e1c36 | |||
| 81eea3d348 | |||
| 9701352e4b | |||
| 749be00a98 | |||
| 5b8077b8ac | |||
| 038e9be4eb | |||
| 68a349114f | |||
| e80bca309e | |||
| fb4983e112 | |||
| 379ea2823a | |||
| 3a6acad431 | |||
| 5490d633ce | |||
| 628d00cd7b | |||
| 4071c76cf3 | |||
| f1bddbd852 | |||
| 9748c5198b | |||
| ee52a32705 | |||
| 8fb85b7bb6 | |||
| 5b31cb1781 | |||
| d660c98c1b | |||
| 5674a40366 | |||
| 8c3e199998 | |||
| 1c26b42296 | |||
| b7adf94c4a | |||
| 4d7fe40fc0 | |||
| 0dc9532065 | |||
| 72a69132dc | |||
| d90d8eb674 | |||
| 0a2f4c0793 | |||
| 1cf3753b90 | |||
| 4f7cde7272 | |||
| 67c14906aa | |||
| 69f46359dd | |||
| d9e00dbd1f | |||
| ad39106b16 |
@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
@ -89,17 +89,33 @@ function cpu_tests() {
|
||||
pytest -x -s -v \
|
||||
tests/lora/test_qwen2vl.py"
|
||||
|
||||
# online serving
|
||||
# online serving: tp+pp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions'
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
|
||||
# online serving: tp+dp
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
}
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
|
||||
@ -462,8 +462,8 @@ steps:
|
||||
- tests/quantization
|
||||
commands:
|
||||
# temporary install here since we need nightly, will move to requirements/test.in
|
||||
# after torchao 0.12 release
|
||||
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||
# after torchao 0.12 release, and pin a working version of torchao nightly here
|
||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@ -566,8 +566,7 @@ steps:
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
|
||||
- pytest -v -s models/multimodal/processing/test_tensor_schema.py
|
||||
- pytest -v -s models/multimodal/processing
|
||||
|
||||
- label: Multi-Modal Models Test (Standard)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -770,6 +769,11 @@ steps:
|
||||
- pytest -v -s plugins_tests/test_platform_plugins.py
|
||||
- pip uninstall vllm_add_dummy_platform -y
|
||||
# end platform plugin tests
|
||||
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
|
||||
- pip install -e ./plugins/prithvi_io_processor_plugin
|
||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||
- pip uninstall prithvi_io_processor_plugin -y
|
||||
# end io_processor plugins test
|
||||
# other tests continue here:
|
||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||
- pip install -e ./plugins/vllm_add_dummy_model
|
||||
|
||||
4
.github/workflows/issue_autolabel.yml
vendored
4
.github/workflows/issue_autolabel.yml
vendored
@ -49,6 +49,10 @@ jobs:
|
||||
term: "VLLM_ROCM_",
|
||||
searchIn: "both"
|
||||
},
|
||||
{
|
||||
term: "aiter",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "rocm",
|
||||
searchIn: "title"
|
||||
|
||||
@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
|
||||
@ -110,7 +110,12 @@ become available.
|
||||
|
||||
🚧: to be supported
|
||||
|
||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
|
||||
For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
|
||||
|
||||
```bash
|
||||
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
|
||||
```
|
||||
|
||||
## 🚀 Example - Online Benchmark
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@ assert current_platform.is_cuda(), (
|
||||
# DeepSeek-V3 weight shapes
|
||||
DEEPSEEK_V3_SHAPES = [
|
||||
(512 + 64, 7168),
|
||||
(2112, 7168),
|
||||
((128 + 64) * 128, 7168),
|
||||
(128 * (128 + 128), 512),
|
||||
(7168, 16384),
|
||||
|
||||
@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
|
||||
# cannot TP
|
||||
total = [
|
||||
(512 + 64, 7168),
|
||||
(2112, 7168),
|
||||
((128 + 64) * 128, 7168),
|
||||
(128 * (128 + 128), 512),
|
||||
(7168, 16384),
|
||||
|
||||
@ -27,11 +27,12 @@
|
||||
|
||||
template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
|
||||
bool kIsVariableB_, bool kIsVariableC_,
|
||||
bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
|
||||
bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_>
|
||||
struct Selective_Scan_fwd_kernel_traits {
|
||||
static_assert(kNItems_ % 4 == 0);
|
||||
using input_t = input_t_;
|
||||
using weight_t = weight_t_;
|
||||
using state_t = state_t_;
|
||||
static constexpr int kNThreads = kNThreads_;
|
||||
// Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
|
||||
static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
|
||||
@ -132,7 +133,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
|
||||
weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
|
||||
input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
|
||||
input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) +
|
||||
typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
|
||||
cache_index * params.ssm_states_batch_stride +
|
||||
dim_id * kNRows * params.ssm_states_dim_stride;
|
||||
|
||||
@ -261,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
if (threadIdx.x == 0) {
|
||||
smem_running_prefix[state_idx] = prefix_op.running_prefix;
|
||||
if (chunk == n_chunks - 1) {
|
||||
ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
|
||||
ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
|
||||
}
|
||||
}
|
||||
#pragma unroll
|
||||
@ -310,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
|
||||
}
|
||||
}
|
||||
|
||||
template<int kNThreads, int kNItems, typename input_t, typename weight_t>
|
||||
template<int kNThreads, int kNItems, typename input_t, typename weight_t, typename state_t>
|
||||
void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) {
|
||||
// Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block
|
||||
// processing 1 row.
|
||||
@ -321,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) {
|
||||
BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
|
||||
BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
|
||||
BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
|
||||
using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kVarlen, input_t, weight_t>;
|
||||
using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ, kVarlen, input_t, weight_t, state_t>;
|
||||
constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
|
||||
dim3 grid(params.batch, params.dim / kNRows);
|
||||
auto kernel = &selective_scan_fwd_kernel<Ktraits>;
|
||||
@ -341,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) {
|
||||
});
|
||||
}
|
||||
|
||||
template<typename input_t, typename weight_t>
|
||||
template<typename input_t, typename weight_t, typename state_t>
|
||||
void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) {
|
||||
|
||||
#ifndef USE_ROCM
|
||||
if (params.seqlen <= 128) {
|
||||
selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
|
||||
} else if (params.seqlen <= 256) {
|
||||
selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
|
||||
} else if (params.seqlen <= 512) {
|
||||
selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream);
|
||||
} else if (params.seqlen <= 1024) {
|
||||
selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
|
||||
} else {
|
||||
selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
|
||||
}
|
||||
#else
|
||||
if (params.seqlen <= 256) {
|
||||
selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
|
||||
} else if (params.seqlen <= 512) {
|
||||
selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
|
||||
} else if (params.seqlen <= 1024) {
|
||||
selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
|
||||
} else {
|
||||
selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
|
||||
selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
template void selective_scan_fwd_cuda<float, float>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
template void selective_scan_fwd_cuda<at::BFloat16, float, at::BFloat16>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
template void selective_scan_fwd_cuda<at::BFloat16, float, float>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
template void selective_scan_fwd_cuda<at::Half, float, at::Half>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
template void selective_scan_fwd_cuda<at::Half, float, float>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
template void selective_scan_fwd_cuda<float, float, float>(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
|
||||
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
|
||||
|
||||
#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \
|
||||
#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...) \
|
||||
if (ITYPE == at::ScalarType::Half) { \
|
||||
using input_t = at::Half; \
|
||||
using weight_t = float; \
|
||||
__VA_ARGS__(); \
|
||||
if (STYPE == at::ScalarType::Half) { \
|
||||
using state_t = at::Half; \
|
||||
__VA_ARGS__(); \
|
||||
} else if (STYPE == at::ScalarType::Float) { \
|
||||
using state_t = float; \
|
||||
__VA_ARGS__(); \
|
||||
} else { \
|
||||
AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
|
||||
} \
|
||||
} else if (ITYPE == at::ScalarType::BFloat16) { \
|
||||
using input_t = at::BFloat16; \
|
||||
using weight_t = float; \
|
||||
__VA_ARGS__(); \
|
||||
if (STYPE == at::ScalarType::BFloat16) { \
|
||||
using state_t = at::BFloat16; \
|
||||
__VA_ARGS__(); \
|
||||
} else if (STYPE == at::ScalarType::Float) { \
|
||||
using state_t = float; \
|
||||
__VA_ARGS__(); \
|
||||
} else { \
|
||||
AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
|
||||
} \
|
||||
} else if (ITYPE == at::ScalarType::Float) { \
|
||||
using input_t = float; \
|
||||
using weight_t = float; \
|
||||
using state_t = float; \
|
||||
__VA_ARGS__(); \
|
||||
} else { \
|
||||
AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
|
||||
}
|
||||
|
||||
|
||||
template<typename input_t, typename weight_t>
|
||||
template<typename input_t, typename weight_t, typename state_t>
|
||||
void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream);
|
||||
|
||||
void set_ssm_params_fwd(SSMParamsBase ¶ms,
|
||||
@ -648,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
|
||||
|
||||
// Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
|
||||
at::Tensor out = delta;
|
||||
TORCH_CHECK(ssm_states.scalar_type() == input_type);
|
||||
// ssm_states can now be either the same as input_type or float32
|
||||
auto state_type = ssm_states.scalar_type();
|
||||
TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float);
|
||||
TORCH_CHECK(ssm_states.is_cuda());
|
||||
TORCH_CHECK(ssm_states.stride(-1) == 1);
|
||||
|
||||
@ -670,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
|
||||
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
|
||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
|
||||
selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
|
||||
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] {
|
||||
selective_scan_fwd_cuda<input_t, weight_t, state_t>(params, stream);
|
||||
});
|
||||
}
|
||||
|
||||
@ -130,8 +130,8 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
|
||||
void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
|
||||
torch::Tensor& scale);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
void silu_and_mul_nvfp4_quant(torch::Tensor& out,
|
||||
torch::Tensor& output_block_scale,
|
||||
torch::Tensor& input,
|
||||
|
||||
@ -115,7 +115,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
|
||||
ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
ops.def(
|
||||
"silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
|
||||
"Tensor input, Tensor input_global_scale) -> ()");
|
||||
|
||||
@ -82,6 +82,7 @@ ARG GET_PIP_URL
|
||||
# Install Python and other dependencies
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get clean -y \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo \
|
||||
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
||||
@ -432,11 +433,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "${DEEPGEMM_GIT_REF}" \
|
||||
&& rm /tmp/install_deepgemm.sh
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||
|
||||
# Install EP kernels(pplx-kernels and DeepEP), NixL
|
||||
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
|
||||
|
||||
@ -174,8 +174,10 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
|
||||
|
||||
Known supported models:
|
||||
|
||||
- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
|
||||
- Kimi-VL (<gh-pr:23817>)
|
||||
- Llama4 (<gh-pr:18368>)
|
||||
- MiniCPM-V-4 (<gh-pr:23327>)
|
||||
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
||||
- Qwen2.5-VL (<gh-pr:22742>)
|
||||
- Step3 (<gh-pr:22697>)
|
||||
|
||||
|
||||
@ -121,3 +121,31 @@ To support a model with interleaving sliding windows, we need to take care of th
|
||||
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
|
||||
|
||||
With these two steps, interleave sliding windows should work with the model.
|
||||
|
||||
### How to support models that use Mamba?
|
||||
|
||||
We consider 3 different scenarios:
|
||||
|
||||
1. Models that use Mamba layers (either Mamba-1 or Mamba-2) but do not use attention layers.
|
||||
2. Models that combine Mamba layers (either Mamba-1 or Mamba-2) together with attention layers.
|
||||
3. Models that combine Mamba-like mechanisms (e.g., Linear Attention, ShortConv) together with attention layers.
|
||||
|
||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](gh-file:vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](gh-file:vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
||||
For the mamba layers themselves, please use the [`MambaMixer`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](gh-file:vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
||||
V0-only classes and code will be removed in the very near future.
|
||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in <gh-file:vllm/model_executor/models/config.py> to ensure that the runtime defaults are optimized.
|
||||
|
||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](gh-file:vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](gh-file:vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||
These models should follow the same instructions as case (1), but they should inherit protocol `IsHybrid` (instead of `IsAttentionFree`) and it is *not* necessary to add them to the `MODELS_CONFIG_MAP` (their runtime defaults will be inferred from the protocol).
|
||||
|
||||
For case (3), we recommend looking at the implementation of [`MiniMaxText01ForCausalLM`](gh-file:vllm/model_executor/models/minimax_text_01.py) or [`Lfm2ForCausalLM`](gh-file:vllm/model_executor/models/lfm2.py) as a reference, which use custom "mamba-like" layers `MiniMaxText01LinearAttention` and `ShortConv` respectively.
|
||||
Please follow the same guidelines as case (2) for implementing these models.
|
||||
We use "mamba-like" to refer to layers that posses a state that is updated in-place, rather than being appended-to (like KV cache for attention).
|
||||
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
|
||||
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
|
||||
Please see [`LinearAttentionMetadata`](gh-file:vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](gh-file:v1/attention/backends/short_conv_attn.py) for examples of this.
|
||||
Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it.
|
||||
Please see the calls to `direct_register_custom_op` in <gh-file:vllm/model_executor/models/minimax_text_01.py> or <gh-file:vllm/model_executor/layers/mamba/short_conv.py> for examples of this.
|
||||
The new custom op should then be added to the list `_attention_ops` in <gh-file:vllm/config/compilation.py> to ensure that piecewise CUDA graphs works as intended.
|
||||
|
||||
@ -73,6 +73,8 @@ apt install nsight-systems-cli
|
||||
|
||||
### Example commands and usage
|
||||
|
||||
When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues).
|
||||
|
||||
#### Offline Inference
|
||||
|
||||
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
|
||||
|
||||
@ -22,7 +22,7 @@ Deploy the following yaml file `lws.yaml`
|
||||
metadata:
|
||||
name: vllm
|
||||
spec:
|
||||
replicas: 2
|
||||
replicas: 1
|
||||
leaderWorkerTemplate:
|
||||
size: 2
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
@ -41,7 +41,7 @@ Deploy the following yaml file `lws.yaml`
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
||||
vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
@ -126,8 +126,6 @@ Should get an output similar to this:
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
vllm-0 1/1 Running 0 2s
|
||||
vllm-0-1 1/1 Running 0 2s
|
||||
vllm-1 1/1 Running 0 2s
|
||||
vllm-1-1 1/1 Running 0 2s
|
||||
```
|
||||
|
||||
Verify that the distributed tensor-parallel inference works:
|
||||
|
||||
78
docs/design/io_processor_plugins.md
Normal file
78
docs/design/io_processor_plugins.md
Normal file
@ -0,0 +1,78 @@
|
||||
# IO Processor Plugins
|
||||
|
||||
IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.
|
||||
|
||||
When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggerd via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
|
||||
|
||||
## Writing an IO Processor Plugin
|
||||
|
||||
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
|
||||
|
||||
```python
|
||||
IOProcessorInput = TypeVar('IOProcessorInput')
|
||||
IOProcessorOutput = TypeVar('IOProcessorOutput')
|
||||
|
||||
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
self.vllm_config = vllm_config
|
||||
|
||||
@abstractmethod
|
||||
def pre_process(
|
||||
self,
|
||||
prompt: IOProcessorInput,
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Union[PromptType, Sequence[PromptType]]:
|
||||
raise NotImplementedError
|
||||
|
||||
async def pre_process_async(
|
||||
self,
|
||||
prompt: IOProcessorInput,
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Union[PromptType, Sequence[PromptType]]:
|
||||
return self.pre_process(prompt, request_id, **kwargs)
|
||||
|
||||
@abstractmethod
|
||||
def post_process(self,
|
||||
model_output: Sequence[PoolingRequestOutput],
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs) -> IOProcessorOutput:
|
||||
raise NotImplementedError
|
||||
|
||||
async def post_process_async(
|
||||
self,
|
||||
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> IOProcessorOutput:
|
||||
collected_output = [item async for i, item in model_output]
|
||||
return self.post_process(collected_output, request_id, **kwargs)
|
||||
|
||||
@abstractmethod
|
||||
def parse_request(self, request: Any) -> IOProcessorInput:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def output_to_response(
|
||||
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
|
||||
raise NotImplementedError
|
||||
```
|
||||
|
||||
The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods.
|
||||
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
|
||||
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
|
||||
|
||||
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
|
||||
|
||||
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
|
||||
|
||||
## Using an IO Processor plugin
|
||||
|
||||
IO Processor plugins are loaded at engine startup and there are two methods for specifying the name of the plugin to be loaded:
|
||||
|
||||
1. Via vLLM's `EngineArgs`: setting the `io_processor_plugin` argument in the `EngineArgs` used to initialize the `AsyncLLM`. The same can be achieved by passing the `io_processor_plugin` argument to `LLM` in offline mode, or by passing the `--io-processor-plugin` argument in serving mode.
|
||||
2. Via the model HF configuration: adding an `io_processor_plugin` field to the model config (config.json).
|
||||
|
||||
The order also determines method priority. i.e., setting the plugin name via `EngineArgs` will override any plugin name specified in the model HF config (config.json).
|
||||
@ -49,6 +49,8 @@ Every plugin has three parts:
|
||||
|
||||
- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
|
||||
|
||||
- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for poling models. The plugin function returns the IOProcessor's class fully qualified name.
|
||||
|
||||
## Guidelines for Writing Plugins
|
||||
|
||||
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
||||
|
||||
@ -13,6 +13,41 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
|
||||
- `prompt`: The prompt should follow the format that is documented on HuggingFace.
|
||||
- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].
|
||||
|
||||
### Stable UUIDs for Caching (multi_modal_uuids)
|
||||
|
||||
When using multi-modal inputs, vLLM normally hashes each media item by content to enable caching across requests. You can optionally pass `multi_modal_uuids` to provide your own stable IDs for each item so caching can reuse work across requests without rehashing the raw content.
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
from PIL import Image
|
||||
|
||||
# Qwen2.5-VL example with two images
|
||||
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
|
||||
|
||||
prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
|
||||
img_a = Image.open("/path/to/a.jpg")
|
||||
img_b = Image.open("/path/to/b.jpg")
|
||||
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {"image": [img_a, img_b]},
|
||||
# Provide stable IDs for caching.
|
||||
# Requirements (matched by this example):
|
||||
# - Include every modality present in multi_modal_data.
|
||||
# - For lists, provide the same number of entries.
|
||||
# - Use None to fall back to content hashing for that item.
|
||||
"multi_modal_uuids": {"image": ["sku-1234-a", None]},
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
print(o.outputs[0].text)
|
||||
```
|
||||
|
||||
!!! warning
|
||||
If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.
|
||||
|
||||
### Image Inputs
|
||||
|
||||
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
|
||||
|
||||
@ -96,6 +96,7 @@ Currently, there are no pre-built CPU wheels.
|
||||
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
|
||||
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively.
|
||||
- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`.
|
||||
- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence.
|
||||
- `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
|
||||
- `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False).
|
||||
|
||||
@ -179,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
|
||||
- Offline Inference: `256 * world_size`
|
||||
- Online Serving: `128 * world_size`
|
||||
|
||||
vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use TP and PP together if there are enough CPU sockets and memory nodes.
|
||||
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
|
||||
|
||||
### Which quantization configs does vLLM CPU support?
|
||||
|
||||
@ -193,3 +194,35 @@ vLLM CPU supports tensor parallel (TP) and pipeline parallel (PP) to leverage mu
|
||||
- Both of them require `amx` CPU flag.
|
||||
- `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models
|
||||
- `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios.
|
||||
|
||||
### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
|
||||
|
||||
In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal.
|
||||
|
||||
To enable these optimizations inside Docker with the least privilege, you can follow below tips:
|
||||
|
||||
```bash
|
||||
docker run ... --cap-add SYS_NICE --security-opt seccomp=unconfined ...
|
||||
|
||||
# 1) `--cap-add SYS_NICE` is to address `get_mempolicy` EPERM issue.
|
||||
|
||||
# 2) `--security-opt seccomp=unconfined` is to enable `migrate_pages` for `numa_migrate_pages()`.
|
||||
# Actually, `seccomp=unconfined` bypasses the seccomp for container,
|
||||
# if it's unacceptable, you can customize your own seccomp profile,
|
||||
# based on docker/runtime default.json and add `migrate_pages` to `SCMP_ACT_ALLOW` list.
|
||||
|
||||
# reference : https://docs.docker.com/engine/security/seccomp/
|
||||
```
|
||||
|
||||
Alternatively, running with `--privileged=true` also works but is broader and not generally recommended.
|
||||
|
||||
In K8S, the following configuration can be added to workload yaml to achieve the same effect as above:
|
||||
|
||||
```yaml
|
||||
securityContext:
|
||||
seccompProfile:
|
||||
type: Unconfined
|
||||
capabilities:
|
||||
add:
|
||||
- SYS_NICE
|
||||
```
|
||||
|
||||
@ -48,6 +48,10 @@ docker run --rm \
|
||||
--dtype=bfloat16 \
|
||||
other vLLM OpenAI server arguments
|
||||
```
|
||||
|
||||
!!! tip
|
||||
An alternative of `--privileged=true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
# --8<-- [end:extra-information]
|
||||
|
||||
@ -16,8 +16,8 @@ cd vllm_source
|
||||
Third, install required dependencies:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/cpu-build.txt --torch-backend auto
|
||||
uv pip install -r requirements/cpu.txt --torch-backend auto
|
||||
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
|
||||
uv pip install -r requirements/cpu.txt --torch-backend cpu
|
||||
```
|
||||
|
||||
??? console "pip"
|
||||
|
||||
@ -89,6 +89,9 @@ docker run --rm \
|
||||
other vLLM OpenAI server arguments
|
||||
```
|
||||
|
||||
!!! tip
|
||||
An alternative of `--privileged true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`.
|
||||
|
||||
# --8<-- [end:build-image-from-source]
|
||||
# --8<-- [start:extra-information]
|
||||
# --8<-- [end:extra-information]
|
||||
|
||||
@ -43,7 +43,8 @@ docker build -f docker/Dockerfile.cpu \
|
||||
|
||||
# Launching OpenAI server
|
||||
docker run --rm \
|
||||
--privileged=true \
|
||||
--security-opt seccomp=unconfined \
|
||||
--cap-add SYS_NICE \
|
||||
--shm-size=4g \
|
||||
-p 8000:8000 \
|
||||
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
|
||||
|
||||
@ -335,9 +335,9 @@ th {
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R, Command-A | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`, `CohereLabs/command-a-reasoning-08-2025`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
|
||||
@ -634,7 +634,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
|
||||
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
|
||||
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
@ -107,16 +107,14 @@ to enable simultaneous generation and embedding using the same engine instance i
|
||||
#### Mamba Models
|
||||
|
||||
Models using selective state-space mechanisms instead of standard transformer attention are supported.
|
||||
Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported.
|
||||
Please note that prefix caching is not yet supported for these models.
|
||||
Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported.
|
||||
|
||||
Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
|
||||
Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
|
||||
`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
|
||||
Please note that prefix caching is not yet supported for these models.
|
||||
|
||||
Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
|
||||
Please note that prefix caching is not yet supported for these models.
|
||||
It is also necessary to enforce eager mode for these models in V1.
|
||||
Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`).
|
||||
|
||||
Please note that prefix caching is not yet supported for any of the above models.
|
||||
|
||||
#### Encoder-Decoder Models
|
||||
|
||||
|
||||
@ -23,7 +23,7 @@ def create_test_prompts(
|
||||
2 requests for base model, 4 requests for the LoRA. We define 2
|
||||
different LoRA adapters (using the same model for demo purposes).
|
||||
Since we also set `max_loras=1`, the expectation is that the requests
|
||||
with the second LoRA adapter will be ran after all requests with the
|
||||
with the second LoRA adapter will be run after all requests with the
|
||||
first adapter have finished.
|
||||
"""
|
||||
return [
|
||||
|
||||
@ -0,0 +1,60 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
import os
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.pooling_params import PoolingParams
|
||||
|
||||
# This example shows how to perform an offline inference that generates
|
||||
# multimodal data. In this specific case this example will take a geotiff
|
||||
# image as input, process it using the multimodal data processor, and
|
||||
# perform inference.
|
||||
# Reuirement - install plugin at:
|
||||
# https://github.com/christian-pinto/prithvi_io_processor_plugin
|
||||
|
||||
|
||||
def main():
|
||||
torch.set_default_dtype(torch.float16)
|
||||
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501
|
||||
|
||||
img_prompt = dict(
|
||||
data=image_url,
|
||||
data_format="url",
|
||||
image_format="tiff",
|
||||
out_data_format="b64_json",
|
||||
)
|
||||
|
||||
llm = LLM(
|
||||
model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
|
||||
skip_tokenizer_init=True,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
# Limit the maximum number of parallel requests
|
||||
# to avoid the model going OOM.
|
||||
# The maximum number depends on the available GPU memory
|
||||
max_num_seqs=32,
|
||||
io_processor_plugin="prithvi_to_tiff_india",
|
||||
)
|
||||
|
||||
pooling_params = PoolingParams(task="encode", softmax=False)
|
||||
pooler_output = llm.encode(
|
||||
img_prompt,
|
||||
pooling_params=pooling_params,
|
||||
)
|
||||
output = pooler_output[0].outputs
|
||||
|
||||
print(output)
|
||||
decoded_data = base64.b64decode(output.data)
|
||||
|
||||
file_path = os.path.join(os.getcwd(), "offline_prediction.tiff")
|
||||
with open(file_path, "wb") as f:
|
||||
f.write(decoded_data)
|
||||
|
||||
print(f"Output file path: {file_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Keye-VL-1.5
|
||||
def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
|
||||
model_name = "Kwai-Keye/Keye-VL-1.5-8B"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|image_pad|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|video_pad|>"
|
||||
|
||||
prompts = [
|
||||
(
|
||||
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n"
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Kimi-VL
|
||||
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -1648,6 +1679,7 @@ model_example_map = {
|
||||
"interns1": run_interns1,
|
||||
"internvl_chat": run_internvl,
|
||||
"keye_vl": run_keye_vl,
|
||||
"keye_vl1_5": run_keye_vl1_5,
|
||||
"kimi_vl": run_kimi_vl,
|
||||
"llama4": run_llama4,
|
||||
"llava": run_llava,
|
||||
|
||||
@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "Kwai-Keye/Keye-VL-1_5-8B"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
image_data = [fetch_image(url) for url in image_urls]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=image_data,
|
||||
)
|
||||
|
||||
|
||||
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "moonshotai/Kimi-VL-A3B-Instruct"
|
||||
|
||||
@ -1209,6 +1246,7 @@ model_example_map = {
|
||||
"interns1": load_interns1,
|
||||
"internvl_chat": load_internvl,
|
||||
"keye_vl": load_keye_vl,
|
||||
"keye_vl1_5": load_keye_vl1_5,
|
||||
"kimi_vl": load_kimi_vl,
|
||||
"llama4": load_llama4,
|
||||
"llava": load_llava,
|
||||
|
||||
@ -27,10 +27,12 @@ class BlockStored(KVCacheEvent):
|
||||
token_ids: list[int]
|
||||
block_size: int
|
||||
lora_id: Optional[int]
|
||||
medium: Optional[str]
|
||||
|
||||
|
||||
class BlockRemoved(KVCacheEvent):
|
||||
block_hashes: list[int]
|
||||
medium: Optional[str]
|
||||
|
||||
|
||||
class AllBlocksCleared(KVCacheEvent):
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
# Example usage:
|
||||
# On the head node machine, start the Ray head node process and run a vLLM server.
|
||||
# ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>] && \
|
||||
# python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
|
||||
# vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
|
||||
#
|
||||
# On each worker node, start the Ray worker node process.
|
||||
# ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]
|
||||
|
||||
@ -266,10 +266,52 @@ def run_audio(model: str) -> None:
|
||||
print("Chat completion output from base64 encoded audio:", result)
|
||||
|
||||
|
||||
def run_multi_audio(model: str) -> None:
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
# Two different audios to showcase batched inference.
|
||||
audio_url = AudioAsset("winning_call").url
|
||||
audio_base64 = encode_base64_content_from_url(audio_url)
|
||||
audio_url2 = AudioAsset("azacinto_foscolo").url
|
||||
audio_base64_2 = encode_base64_content_from_url(audio_url2)
|
||||
|
||||
# OpenAI-compatible schema (`input_audio`)
|
||||
chat_completion_from_base64 = client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Are these two audios the same?"},
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {
|
||||
"data": audio_base64,
|
||||
"format": "wav",
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "input_audio",
|
||||
"input_audio": {
|
||||
"data": audio_base64_2,
|
||||
"format": "wav",
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
model=model,
|
||||
max_completion_tokens=64,
|
||||
)
|
||||
|
||||
result = chat_completion_from_base64.choices[0].message.content
|
||||
print("Chat completion output from input audio:", result)
|
||||
|
||||
|
||||
example_function_map = {
|
||||
"text-only": run_text_only,
|
||||
"single-image": run_single_image,
|
||||
"multi-image": run_multi_image,
|
||||
"multi-audio": run_multi_audio,
|
||||
"video": run_video,
|
||||
"audio": run_audio,
|
||||
}
|
||||
|
||||
55
examples/online_serving/prithvi_geospatial_mae.py
Normal file
55
examples/online_serving/prithvi_geospatial_mae.py
Normal file
@ -0,0 +1,55 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
# This example shows how to perform an online inference that generates
|
||||
# multimodal data. In this specific case this example will take a geotiff
|
||||
# image as input, process it using the multimodal data processor, and
|
||||
# perform inference.
|
||||
# Reuirements :
|
||||
# - install plugin at:
|
||||
# https://github.com/christian-pinto/prithvi_io_processor_plugin
|
||||
# - start vllm in serving mode with the below args
|
||||
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
|
||||
# --task embed --trust-remote-code
|
||||
# --skip-tokenizer-init --enforce-eager
|
||||
# --io-processor-plugin prithvi_to_tiff_india
|
||||
|
||||
|
||||
def main():
|
||||
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501
|
||||
server_endpoint = "http://localhost:8000/pooling"
|
||||
|
||||
request_payload_url = {
|
||||
"data": {
|
||||
"data": image_url,
|
||||
"data_format": "url",
|
||||
"image_format": "tiff",
|
||||
"out_data_format": "b64_json",
|
||||
},
|
||||
"priority": 0,
|
||||
"model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
|
||||
"softmax": False,
|
||||
}
|
||||
|
||||
ret = requests.post(server_endpoint, json=request_payload_url)
|
||||
|
||||
print(f"response.status_code: {ret.status_code}")
|
||||
print(f"response.reason:{ret.reason}")
|
||||
|
||||
response = ret.json()
|
||||
|
||||
decoded_image = base64.b64decode(response["data"]["data"])
|
||||
|
||||
out_path = os.path.join(os.getcwd(), "online_prediction.tiff")
|
||||
|
||||
with open(out_path, "wb") as f:
|
||||
f.write(decoded_image)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -402,7 +402,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
@ -418,7 +418,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -435,7 +435,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -452,7 +452,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -468,7 +468,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "Mean",
|
||||
@ -476,7 +476,7 @@
|
||||
"refId": "E"
|
||||
}
|
||||
],
|
||||
"title": "Time Per Output Token Latency",
|
||||
"title": "Inter Token Latency",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
||||
@ -6,7 +6,7 @@ requires = [
|
||||
"packaging>=24.2",
|
||||
"setuptools>=77.0.3,<80.0.0",
|
||||
"setuptools-scm>=8.0",
|
||||
"torch == 2.7.1",
|
||||
"torch == 2.8.0",
|
||||
"wheel",
|
||||
"jinja2",
|
||||
]
|
||||
|
||||
@ -4,7 +4,8 @@ ninja
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
torch==2.7.1
|
||||
torch==2.8.0
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
regex
|
||||
build
|
||||
|
||||
@ -9,17 +9,16 @@ packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
|
||||
torch==2.7.0; platform_system == "Darwin"
|
||||
torch==2.7.0; platform_machine == "ppc64le"
|
||||
torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has a issue: https://github.com/vllm-project/vllm/issues/17960
|
||||
torch==2.8.0; platform_system == "Darwin"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
|
||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
torchaudio==2.7.0; platform_machine == "ppc64le"
|
||||
torchaudio==2.8.0; platform_machine == "ppc64le"
|
||||
|
||||
# required for the image processor of phi3v, this must be updated alongside torch
|
||||
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
torchvision==0.22.0; platform_machine == "ppc64le"
|
||||
torchvision==0.23.0; platform_machine == "ppc64le"
|
||||
datasets # for benchmark scripts
|
||||
|
||||
# Intel Extension for PyTorch, only for x86_64 CPUs
|
||||
|
||||
@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'
|
||||
|
||||
# Dependencies for NVIDIA GPUs
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
torch==2.7.1
|
||||
torchaudio==2.7.1
|
||||
torch==2.8.0
|
||||
torchaudio==2.8.0
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
|
||||
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
|
||||
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
|
||||
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
|
||||
torch==2.7.0
|
||||
torchvision==0.22.0
|
||||
torchaudio==2.7.0
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.3
|
||||
torch==2.8.0
|
||||
torchvision==0.23.0
|
||||
torchaudio==2.8.0
|
||||
|
||||
triton==3.3.0
|
||||
cmake>=3.26.1,<4
|
||||
|
||||
@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
|
||||
soundfile # required for audio tests
|
||||
jiwer # required for audio tests
|
||||
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
||||
torch==2.7.1
|
||||
torchaudio==2.7.1
|
||||
torchvision==0.22.1
|
||||
torch==2.8.0
|
||||
torchaudio==2.8.0
|
||||
torchvision==0.23.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
|
||||
|
||||
@ -541,42 +541,42 @@ numpy==1.26.4
|
||||
# tritonclient
|
||||
# vocos
|
||||
# xarray
|
||||
nvidia-cublas-cu12==12.8.3.14
|
||||
nvidia-cublas-cu12==12.8.4.1
|
||||
# via
|
||||
# nvidia-cudnn-cu12
|
||||
# nvidia-cusolver-cu12
|
||||
# torch
|
||||
nvidia-cuda-cupti-cu12==12.8.57
|
||||
nvidia-cuda-cupti-cu12==12.8.90
|
||||
# via torch
|
||||
nvidia-cuda-nvrtc-cu12==12.8.61
|
||||
nvidia-cuda-nvrtc-cu12==12.8.93
|
||||
# via torch
|
||||
nvidia-cuda-runtime-cu12==12.8.57
|
||||
nvidia-cuda-runtime-cu12==12.8.90
|
||||
# via torch
|
||||
nvidia-cudnn-cu12==9.7.1.26
|
||||
nvidia-cudnn-cu12==9.10.2.21
|
||||
# via torch
|
||||
nvidia-cufft-cu12==11.3.3.41
|
||||
nvidia-cufft-cu12==11.3.3.83
|
||||
# via torch
|
||||
nvidia-cufile-cu12==1.13.0.11
|
||||
nvidia-cufile-cu12==1.13.1.3
|
||||
# via torch
|
||||
nvidia-curand-cu12==10.3.9.55
|
||||
nvidia-curand-cu12==10.3.9.90
|
||||
# via torch
|
||||
nvidia-cusolver-cu12==11.7.2.55
|
||||
nvidia-cusolver-cu12==11.7.3.90
|
||||
# via torch
|
||||
nvidia-cusparse-cu12==12.5.7.53
|
||||
nvidia-cusparse-cu12==12.5.8.93
|
||||
# via
|
||||
# nvidia-cusolver-cu12
|
||||
# torch
|
||||
nvidia-cusparselt-cu12==0.6.3
|
||||
nvidia-cusparselt-cu12==0.7.1
|
||||
# via torch
|
||||
nvidia-nccl-cu12==2.26.2
|
||||
nvidia-nccl-cu12==2.27.3
|
||||
# via torch
|
||||
nvidia-nvjitlink-cu12==12.8.61
|
||||
nvidia-nvjitlink-cu12==12.8.93
|
||||
# via
|
||||
# nvidia-cufft-cu12
|
||||
# nvidia-cusolver-cu12
|
||||
# nvidia-cusparse-cu12
|
||||
# torch
|
||||
nvidia-nvtx-cu12==12.8.55
|
||||
nvidia-nvtx-cu12==12.8.90
|
||||
# via torch
|
||||
omegaconf==2.3.0
|
||||
# via
|
||||
@ -1069,7 +1069,7 @@ tomli==2.2.1
|
||||
# via schemathesis
|
||||
tomli-w==1.2.0
|
||||
# via schemathesis
|
||||
torch==2.7.1+cu128
|
||||
torch==2.8.0+cu128
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# accelerate
|
||||
@ -1098,7 +1098,7 @@ torch==2.7.1+cu128
|
||||
# torchvision
|
||||
# vector-quantize-pytorch
|
||||
# vocos
|
||||
torchaudio==2.7.1+cu128
|
||||
torchaudio==2.8.0+cu128
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# encodec
|
||||
@ -1111,7 +1111,7 @@ torchmetrics==1.7.4
|
||||
# pytorch-lightning
|
||||
# terratorch
|
||||
# torchgeo
|
||||
torchvision==0.22.1+cu128
|
||||
torchvision==0.23.0+cu128
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# lightly
|
||||
@ -1152,7 +1152,7 @@ transformers==4.55.2
|
||||
# transformers-stream-generator
|
||||
transformers-stream-generator==0.0.5
|
||||
# via -r requirements/test.in
|
||||
triton==3.3.1
|
||||
triton==3.4.0
|
||||
# via torch
|
||||
tritonclient==2.51.0
|
||||
# via
|
||||
|
||||
@ -98,7 +98,7 @@ def test_api_server(api_server, distributed_executor_backend: str):
|
||||
pool.join()
|
||||
|
||||
# check cancellation stats
|
||||
# give it some times to update the stats
|
||||
# give it some time to update the stats
|
||||
time.sleep(1)
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
|
||||
@ -1120,6 +1120,9 @@ class VllmRunner:
|
||||
|
||||
return self.llm.llm_engine.collective_rpc(_apply_model)
|
||||
|
||||
def get_llm(self) -> LLM:
|
||||
return self.llm
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
|
||||
@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
|
||||
test_llm_generator):
|
||||
"""Verify block manager v2 with auto prefix caching could works normal
|
||||
"""Verify block manager v2 with auto prefix caching could work normally
|
||||
even when eviction started.
|
||||
With APC enabled, all blocks are held by native block at the beginning.
|
||||
Then blocks are managed by evictor instead. If cache hit at the evitor's
|
||||
Then blocks are managed by evictor instead. If cache hit at the evictor's
|
||||
block, then it could be reused, or we need to recompute its kv cache.
|
||||
"""
|
||||
output_len = 10
|
||||
|
||||
@ -292,7 +292,7 @@ SP_TEST_MODELS = [
|
||||
# TODO support other models
|
||||
# [LANGUAGE GENERATION]
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
|
||||
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -167,7 +167,7 @@ def test_get_kwargs():
|
||||
# dict should have json tip in help
|
||||
json_tip = "Should either be a valid JSON string or JSON keys"
|
||||
assert json_tip in kwargs["json_tip"]["help"]
|
||||
# nested config should should construct the nested config
|
||||
# nested config should construct the nested config
|
||||
assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)
|
||||
|
||||
|
||||
@ -337,10 +337,6 @@ def test_human_readable_model_len():
|
||||
args = parser.parse_args(["--max-model-len", "10.212345k"])
|
||||
assert args.max_model_len == 10212
|
||||
|
||||
# Auto via -1
|
||||
args = parser.parse_args(["--max-model-len", "-1"])
|
||||
assert args.max_model_len == -1
|
||||
|
||||
# Invalid (do not allow decimals with binary multipliers)
|
||||
for invalid in ["1a", "pwd", "10.24", "1.23M"]:
|
||||
with pytest.raises(ArgumentError):
|
||||
|
||||
@ -201,3 +201,32 @@ table: "table_1" | "table_2"
|
||||
condition: column "=" number
|
||||
number: "1" | "2"
|
||||
""")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def zephyr_lora_files():
|
||||
"""Download zephyr LoRA files once per test session."""
|
||||
from huggingface_hub import snapshot_download
|
||||
return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def zephyr_lora_added_tokens_files(zephyr_lora_files):
|
||||
"""Create zephyr LoRA files with added tokens once per test session."""
|
||||
import shutil
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tmp_dir = TemporaryDirectory()
|
||||
tmp_model_dir = f"{tmp_dir.name}/zephyr"
|
||||
shutil.copytree(zephyr_lora_files, tmp_model_dir)
|
||||
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
|
||||
# Copy tokenizer to adapter and add some unique tokens
|
||||
# 32000, 32001, 32002
|
||||
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
|
||||
special_tokens=True)
|
||||
assert added == 3
|
||||
tokenizer.save_pretrained(tmp_model_dir)
|
||||
yield tmp_model_dir
|
||||
tmp_dir.cleanup()
|
||||
|
||||
@ -62,3 +62,9 @@ def test_encode_api(llm: LLM):
|
||||
err_msg = "pooling_task must be one of.+"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.encode(prompts, use_tqdm=False)
|
||||
|
||||
|
||||
def test_score_api(llm: LLM):
|
||||
err_msg = "Score API is only enabled for num_labels == 1."
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.score("ping", "pong", use_tqdm=False)
|
||||
|
||||
27
tests/entrypoints/openai/conftest.py
Normal file
27
tests/entrypoints/openai/conftest.py
Normal file
@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb():
|
||||
path = AudioAsset('mary_had_lamb').get_local_path()
|
||||
with open(str(path), "rb") as f:
|
||||
yield f
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def winning_call():
|
||||
path = AudioAsset('winning_call').get_local_path()
|
||||
with open(str(path), "rb") as f:
|
||||
yield f
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def foscolo():
|
||||
# Test translation it->en
|
||||
path = AudioAsset('azacinto_foscolo').get_local_path()
|
||||
with open(str(path), "rb") as f:
|
||||
yield f
|
||||
@ -15,8 +15,6 @@ import torch
|
||||
from openai import BadRequestError, OpenAI
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
|
||||
from .test_completion import zephyr_lora_files # noqa: F401
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_score(server: RemoteOpenAIServer, model_name: str):
|
||||
# score api is only enabled for num_labels == 1.
|
||||
response = requests.post(
|
||||
server.url_for("score"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"text_1": "ping",
|
||||
"text_2": "pong",
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_rerank(server: RemoteOpenAIServer, model_name: str):
|
||||
# rerank api is only enabled for num_labels == 1.
|
||||
response = requests.post(
|
||||
server.url_for("rerank"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"query": "ping",
|
||||
"documents": ["pong"],
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
|
||||
@ -27,6 +27,28 @@ def serve_parser():
|
||||
return make_arg_parser(parser)
|
||||
|
||||
|
||||
### Test config parsing
|
||||
def test_config_arg_parsing(serve_parser, cli_config_file):
|
||||
args = serve_parser.parse_args([])
|
||||
assert args.port == 8000
|
||||
args = serve_parser.parse_args(['--config', cli_config_file])
|
||||
assert args.port == 12312
|
||||
args = serve_parser.parse_args([
|
||||
'--config',
|
||||
cli_config_file,
|
||||
'--port',
|
||||
'9000',
|
||||
])
|
||||
assert args.port == 9000
|
||||
args = serve_parser.parse_args([
|
||||
'--port',
|
||||
'9000',
|
||||
'--config',
|
||||
cli_config_file,
|
||||
])
|
||||
assert args.port == 9000
|
||||
|
||||
|
||||
### Tests for LoRA module parsing
|
||||
def test_valid_key_value_format(serve_parser):
|
||||
# Test old format: name=path
|
||||
|
||||
@ -3,8 +3,6 @@
|
||||
# imports for guided decoding tests
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Optional
|
||||
|
||||
import jsonschema
|
||||
@ -14,9 +12,7 @@ import pytest_asyncio
|
||||
import regex as re
|
||||
import requests
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
from openai import BadRequestError
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
# technically these adapters use a different base model,
|
||||
# but we're not testing generation quality here
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_files():
|
||||
return snapshot_download(repo_id=LORA_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_added_tokens_files(zephyr_lora_files):
|
||||
tmp_dir = TemporaryDirectory()
|
||||
tmp_model_dir = f"{tmp_dir.name}/zephyr"
|
||||
shutil.copytree(zephyr_lora_files, tmp_model_dir)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
# Copy tokenizer to adapter and add some unique tokens
|
||||
# 32000, 32001, 32002
|
||||
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
|
||||
special_tokens=True)
|
||||
assert added == 3
|
||||
tokenizer.save_pretrained(tmp_model_dir)
|
||||
yield tmp_model_dir
|
||||
tmp_dir.cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
|
||||
return [
|
||||
|
||||
@ -3,48 +3,23 @@
|
||||
|
||||
import base64
|
||||
import io
|
||||
import shutil
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
from openai import BadRequestError
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
from transformers import AutoConfig
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
|
||||
CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_files():
|
||||
return snapshot_download(repo_id=LORA_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_added_tokens_files(zephyr_lora_files):
|
||||
tmp_dir = TemporaryDirectory()
|
||||
tmp_model_dir = f"{tmp_dir.name}/zephyr"
|
||||
shutil.copytree(zephyr_lora_files, tmp_model_dir)
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
# Copy tokenizer to adapter and add some unique tokens
|
||||
# 32000, 32001, 32002
|
||||
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
|
||||
special_tokens=True)
|
||||
assert added == 3
|
||||
tokenizer.save_pretrained(tmp_model_dir)
|
||||
yield tmp_model_dir
|
||||
tmp_dir.cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args(
|
||||
zephyr_lora_files,
|
||||
|
||||
@ -9,8 +9,6 @@ from contextlib import suppress
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
|
||||
# generation quality here
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
|
||||
BADREQUEST_CASES = [
|
||||
(
|
||||
@ -48,11 +45,6 @@ BADREQUEST_CASES = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_files():
|
||||
return snapshot_download(repo_id=LORA_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def monkeypatch_module():
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
|
||||
@ -47,6 +47,7 @@ class MockModelConfig:
|
||||
allowed_local_media_path: str = ""
|
||||
encoder_config = None
|
||||
generation_config: str = "auto"
|
||||
skip_tokenizer_init: bool = False
|
||||
|
||||
def get_diff_sampling_param(self):
|
||||
return self.diff_sampling_param or {}
|
||||
|
||||
@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:request_params_max_tokens_sum",
|
||||
"vllm:request_params_max_tokens_bucket",
|
||||
"vllm:request_params_max_tokens_count",
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
"vllm:inter_token_latency_seconds_sum",
|
||||
"vllm:inter_token_latency_seconds_bucket",
|
||||
"vllm:inter_token_latency_seconds_count",
|
||||
"vllm:e2e_request_latency_seconds_sum",
|
||||
"vllm:e2e_request_latency_seconds_bucket",
|
||||
"vllm:e2e_request_latency_seconds_count",
|
||||
@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:request_decode_time_seconds_count",
|
||||
]
|
||||
|
||||
HIDDEN_DEPRECATED_METRICS: list[str] = []
|
||||
HIDDEN_DEPRECATED_METRICS: list[str] = [
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
|
||||
for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
|
||||
if (not server.show_hidden_metrics
|
||||
and metric not in HIDDEN_DEPRECATED_METRICS):
|
||||
assert metric in response.text
|
||||
if (metric in HIDDEN_DEPRECATED_METRICS
|
||||
and not server.show_hidden_metrics):
|
||||
continue
|
||||
assert metric in response.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -4,8 +4,6 @@
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
|
||||
# generation quality here
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def zephyr_lora_files():
|
||||
return snapshot_download(repo_id=LORA_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
||||
@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .test_completion import default_server_args # noqa: F401
|
||||
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
|
||||
from .test_completion import zephyr_lora_files # noqa: F401
|
||||
from .test_completion import MODEL_NAME
|
||||
|
||||
|
||||
|
||||
73
tests/entrypoints/openai/test_token_in_token_out.py
Normal file
73
tests/entrypoints/openai/test_token_in_token_out.py
Normal file
@ -0,0 +1,73 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
download_weights_from_hf)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
global MODEL_PATH
|
||||
MODEL_PATH = download_weights_from_hf(
|
||||
MODEL_NAME,
|
||||
allow_patterns=["*"],
|
||||
cache_dir=MODEL_PATH,
|
||||
ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
|
||||
args = [
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--enforce-eager",
|
||||
"--skip-tokenizer-init",
|
||||
"--load-format",
|
||||
"dummy",
|
||||
]
|
||||
with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_token_in_token_out_and_logprobs(server):
|
||||
"""
|
||||
Test token-in-token-out and token_ids align with prompt_logprobs
|
||||
& logprobs when return_tokens_as_token_ids is enabled.
|
||||
"""
|
||||
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
|
||||
text = "Hello, world! How are you today?"
|
||||
token_ids = tokenizer.encode(text)
|
||||
async with server.get_async_client() as client:
|
||||
# Test with both return_token_ids and return_tokens_as_token_ids enabled
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_PATH,
|
||||
prompt=token_ids,
|
||||
max_tokens=20,
|
||||
temperature=0,
|
||||
echo=True,
|
||||
extra_body={
|
||||
"return_token_ids": True,
|
||||
},
|
||||
)
|
||||
|
||||
# Verify all fields are present
|
||||
assert (completion.choices[0].token_ids is not None
|
||||
and 0 < len(completion.choices[0].token_ids) <= 20)
|
||||
assert completion.choices[0].prompt_token_ids is not None
|
||||
|
||||
# Decode prompt tokens
|
||||
if completion.choices[0].prompt_token_ids:
|
||||
prompt_text = tokenizer.decode(
|
||||
completion.choices[0].prompt_token_ids)
|
||||
# The decoded prompt should match or close to original prompt
|
||||
assert prompt_text == text
|
||||
@ -8,8 +8,6 @@ import requests
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
|
||||
from .test_completion import zephyr_lora_files # noqa: F401
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
@ -12,8 +12,6 @@ import pytest
|
||||
import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
||||
@ -24,20 +22,6 @@ MISTRAL_FORMAT_ARGS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb():
|
||||
path = AudioAsset('mary_had_lamb').get_local_path()
|
||||
with open(str(path), "rb") as f:
|
||||
yield f
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def winning_call():
|
||||
path = AudioAsset('winning_call').get_local_path()
|
||||
with open(str(path), "rb") as f:
|
||||
yield f
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
|
||||
@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name):
|
||||
assert out_usage["seconds"] == 16, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_gemma(foscolo):
|
||||
# Gemma accuracy on some of the audio samples we use is particularly bad,
|
||||
# hence we use a different one here. WER is evaluated separately.
|
||||
model_name = "google/gemma-3n-E2B-it"
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=foscolo,
|
||||
language="it",
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
assert "da cui vergine nacque Venere" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_non_asr_model(winning_call):
|
||||
# text to text model
|
||||
|
||||
@ -12,32 +12,24 @@ import pytest
|
||||
import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/whisper-small"
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def foscolo():
|
||||
# Test translation it->en
|
||||
path = AudioAsset('azacinto_foscolo').get_local_path()
|
||||
with open(str(path), "rb") as f:
|
||||
yield f
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
|
||||
yield remote_server
|
||||
@pytest.fixture(scope="module",
|
||||
params=["openai/whisper-small", "google/gemma-3n-E2B-it"])
|
||||
def server(request):
|
||||
# Parametrize over model name
|
||||
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
|
||||
yield remote_server, request.param
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async def client_and_model(server):
|
||||
server, model_name = server
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
yield async_client, model_name
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -56,27 +48,29 @@ async def test_non_asr_model(foscolo):
|
||||
|
||||
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio(foscolo, client):
|
||||
async def test_basic_audio(foscolo, client_and_model):
|
||||
client, model_name = client_and_model
|
||||
translation = await client.audio.translations.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
file=foscolo,
|
||||
response_format="text",
|
||||
# TODO remove once language detection is implemented
|
||||
extra_body=dict(language="it"),
|
||||
# TODO remove `language="it"` once language detection is implemented
|
||||
extra_body=dict(language="it", to_language="en"),
|
||||
temperature=0.0)
|
||||
out = json.loads(translation)['text'].strip().lower()
|
||||
assert "greek sea" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_audio_prompt(foscolo, client):
|
||||
async def test_audio_prompt(foscolo, client_and_model):
|
||||
client, model_name = client_and_model
|
||||
# Condition whisper on starting text
|
||||
prompt = "Nor have I ever"
|
||||
transcription = await client.audio.translations.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
file=foscolo,
|
||||
prompt=prompt,
|
||||
extra_body=dict(language="it"),
|
||||
extra_body=dict(language="it", to_language="en"),
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(transcription)['text']
|
||||
@ -85,22 +79,27 @@ async def test_audio_prompt(foscolo, client):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_response(foscolo, client, server):
|
||||
async def test_streaming_response(foscolo, client_and_model, server):
|
||||
client, model_name = client_and_model
|
||||
translation = ""
|
||||
res_no_stream = await client.audio.translations.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
file=foscolo,
|
||||
response_format="json",
|
||||
extra_body=dict(language="it"),
|
||||
extra_body=dict(language="it", to_language="en", seed=42),
|
||||
temperature=0.0)
|
||||
|
||||
# Stream via HTTPX since OpenAI translation client doesn't expose streaming
|
||||
server, model_name = server
|
||||
url = server.url_for("v1/audio/translations")
|
||||
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
|
||||
data = {
|
||||
"model": MODEL_NAME,
|
||||
"model": model_name,
|
||||
"language": "it",
|
||||
"to_language": "en",
|
||||
"stream": True,
|
||||
"temperature": 0.0,
|
||||
"seed": 42,
|
||||
}
|
||||
foscolo.seek(0)
|
||||
async with httpx.AsyncClient() as http_client:
|
||||
@ -121,16 +120,24 @@ async def test_streaming_response(foscolo, client, server):
|
||||
text = chunk["choices"][0].get("delta", {}).get("content")
|
||||
translation += text or ""
|
||||
|
||||
assert translation == res_no_stream.text
|
||||
res_stream = translation.split()
|
||||
# NOTE There's a small non-deterministic issue here, likely in the attn
|
||||
# computation, which will cause a few tokens to be different, while still
|
||||
# being very close semantically.
|
||||
assert sum([
|
||||
x == y for x, y in zip(res_stream, res_no_stream.text.split())
|
||||
]) >= len(res_stream) * 0.9
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stream_options(foscolo, client, server):
|
||||
async def test_stream_options(foscolo, server):
|
||||
server, model_name = server
|
||||
url = server.url_for("v1/audio/translations")
|
||||
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
|
||||
data = {
|
||||
"model": MODEL_NAME,
|
||||
"model": model_name,
|
||||
"language": "it",
|
||||
"to_language": "en",
|
||||
"stream": True,
|
||||
"stream_include_usage": True,
|
||||
"stream_continuous_usage_stats": True,
|
||||
@ -164,7 +171,10 @@ async def test_stream_options(foscolo, client, server):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_long_audio_request(foscolo, client):
|
||||
async def test_long_audio_request(foscolo, client_and_model):
|
||||
client, model_name = client_and_model
|
||||
if model_name == "google/gemma-3n-E2B-it":
|
||||
pytest.skip("Gemma3n does not support long audio requests")
|
||||
foscolo.seek(0)
|
||||
audio, sr = librosa.load(foscolo)
|
||||
repeated_audio = np.tile(audio, 2)
|
||||
@ -173,9 +183,9 @@ async def test_long_audio_request(foscolo, client):
|
||||
sf.write(buffer, repeated_audio, sr, format='WAV')
|
||||
buffer.seek(0)
|
||||
translation = await client.audio.translations.create(
|
||||
model=MODEL_NAME,
|
||||
model=model_name,
|
||||
file=buffer,
|
||||
extra_body=dict(language="it"),
|
||||
extra_body=dict(language="it", to_language="en"),
|
||||
response_format="text",
|
||||
temperature=0.0)
|
||||
out = json.loads(translation)['text'].strip().lower()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -282,7 +282,7 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor,
|
||||
a1_scale=a1_scale,
|
||||
block_shape=block_shape,
|
||||
# Make sure this is set to False so we
|
||||
# dont end up comparing the same implementation.
|
||||
# don't end up comparing the same implementation.
|
||||
allow_deep_gemm=False)
|
||||
|
||||
|
||||
|
||||
@ -8,7 +8,8 @@ from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
if not current_platform.has_device_capability(100):
|
||||
if not (current_platform.has_device_capability(100)
|
||||
and hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant")):
|
||||
pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
|
||||
allow_module_level=True)
|
||||
|
||||
|
||||
@ -59,10 +59,10 @@ async def requests_processing_time(llm,
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_lora(chatglm3_lora_files):
|
||||
"""
|
||||
The add_lora function is used to pre-load some LoRA adapters into the
|
||||
The add_lora function is used to preload some LoRA adapters into the
|
||||
engine in anticipation of future requests using these adapters. To test
|
||||
this functionality, we use the async engine to process some requests - We
|
||||
do it twice, once with add_lora() pre-loading and once without.
|
||||
do it twice, once with add_lora() preloading and once without.
|
||||
|
||||
We measure the request processing time in both cases and expect the time
|
||||
to be lesser in the case with add_lora() calls.
|
||||
|
||||
@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
|
||||
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
|
||||
# more GPU memory causing vLLM to OOM
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
tensor_parallel_size=4,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=True,
|
||||
enable_chunked_prefill=True)
|
||||
enable_chunked_prefill=True,
|
||||
gpu_memory_utilization=0.85)
|
||||
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
|
||||
for i in range(len(EXPECTED_LORA_OUTPUT)):
|
||||
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
|
||||
adapters that define additional tokens.
|
||||
"""
|
||||
|
||||
# Setup a base model compatible with the sql_lora_files adapter and
|
||||
# Set up a base model compatible with the sql_lora_files adapter and
|
||||
# a known number of tokens in the base model.
|
||||
model_config = ModelConfig(
|
||||
model=llama_2_7b_base_huggingface_id,
|
||||
@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab(
|
||||
adapters that do not define additional tokens.
|
||||
"""
|
||||
|
||||
# Setup a base model compatible with the qwen25vl_lora_files adapter and
|
||||
# Set up a base model compatible with the qwen25vl_lora_files adapter and
|
||||
# a known number of tokens in the base model.
|
||||
model_config = ModelConfig(
|
||||
model=qwen25vl_base_huggingface_id,
|
||||
|
||||
@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# These have unsupported head_dim for FA. We do not
|
||||
# not have a clean way to fall back, so we fail with
|
||||
# have a clean way to fall back, so we fail with
|
||||
# a clear msg when it happens.
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
@ -92,7 +92,8 @@ AITER_MODEL_LIST = [
|
||||
pytest.param(
|
||||
"allenai/OLMoE-1B-7B-0924-Instruct",
|
||||
marks=[pytest.mark.cpu_model],
|
||||
)
|
||||
),
|
||||
pytest.param("swiss-ai/Apertus-8B"), # apertus
|
||||
])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
|
||||
@ -34,17 +34,6 @@ HYBRID_MODELS = [
|
||||
"LiquidAI/LFM2-1.2B",
|
||||
]
|
||||
|
||||
HF_UNSUPPORTED_MODELS = [
|
||||
# The HF transformers implementation of
|
||||
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
|
||||
# doesn't compare vLLM output with HF output.
|
||||
# See https://github.com/huggingface/transformers/pull/35943
|
||||
"yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
# transformers 4.55 is still producing garbage for this model
|
||||
# TODO(tdoublep): follow-up on transformers side
|
||||
"ibm-granite/granite-4.0-tiny-preview"
|
||||
]
|
||||
|
||||
V1_SUPPORTED_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
@ -65,6 +54,11 @@ V0_UNSUPPORTED_MODELS = [
|
||||
"LiquidAI/LFM2-1.2B",
|
||||
]
|
||||
|
||||
FP32_STATE_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
]
|
||||
|
||||
# Avoid OOM
|
||||
MAX_NUM_SEQS = 4
|
||||
|
||||
@ -85,20 +79,13 @@ def test_models(
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
hf_version_check = model_info.check_transformers_version(
|
||||
on_fail="return")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
hf_version_check = None
|
||||
|
||||
if hf_version_check is not None:
|
||||
print(f"Skipping transformers comparison because: {hf_version_check}")
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
hf_outputs = None
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
@ -116,7 +103,7 @@ def test_models(
|
||||
else:
|
||||
vllm_v1_outputs = None
|
||||
|
||||
if hf_outputs is not None and vllm_v0_outputs is not None:
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
@ -125,12 +112,10 @@ def test_models(
|
||||
)
|
||||
|
||||
if model in V1_SUPPORTED_MODELS:
|
||||
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
|
||||
assert ref_outputs is not None
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=ref_outputs,
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
name_0="hf" if hf_outputs is not None else "vllm-v0",
|
||||
name_0="hf",
|
||||
name_1="vllm-v1",
|
||||
)
|
||||
|
||||
@ -397,11 +382,8 @@ def test_full_cuda_graph(
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
if model not in HF_UNSUPPORTED_MODELS:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
hf_outputs = None
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
@ -416,7 +398,7 @@ def test_full_cuda_graph(
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
if hf_outputs is not None and vllm_v0_outputs is not None:
|
||||
if vllm_v0_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
@ -424,17 +406,15 @@ def test_full_cuda_graph(
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
|
||||
assert ref_outputs is not None
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=ref_outputs,
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
name_0="hf" if hf_outputs is not None else "vllm-v0",
|
||||
name_0="hf",
|
||||
name_1="vllm-v1",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
|
||||
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_fp32_state(
|
||||
@ -455,11 +435,8 @@ def test_fp32_state(
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
if model not in HF_UNSUPPORTED_MODELS:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
hf_outputs = None
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
@ -475,18 +452,16 @@ def test_fp32_state(
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
if hf_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=ref_outputs,
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
name_0="hf" if hf_outputs is not None else "vllm-v0",
|
||||
name_0="hf",
|
||||
name_1="vllm-v1",
|
||||
)
|
||||
|
||||
@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
# uses the v3-Tekken tokenizer
|
||||
"mistralai/Ministral-8B-Instruct-2410",
|
||||
# Mistral-Nemo is to big for CI, but passes locally
|
||||
# Mistral-Nemo is too big for CI, but passes locally
|
||||
# "mistralai/Mistral-Nemo-Instruct-2407"
|
||||
]
|
||||
|
||||
@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
|
||||
|
||||
|
||||
def test_mistral_function_call_nested_json():
|
||||
"""Ensure that the function-name regex captures the entire outer-most
|
||||
"""Ensure that the function-name regex captures the entire outermost
|
||||
JSON block, including nested braces."""
|
||||
|
||||
# Create a minimal stub tokenizer that provides the few attributes the
|
||||
|
||||
@ -154,7 +154,7 @@ def batch_make_image_embeddings(
|
||||
embed_counter += cur_batch_embed_len
|
||||
image_counter += cur_batch_image_count
|
||||
|
||||
# ensure we don't lost any images or embeddings
|
||||
# ensure we don't lose any images or embeddings
|
||||
assert embed_counter == image_embeds.size(0)
|
||||
assert image_counter == image_grid_thw.size(0)
|
||||
assert len(image_batches) == len(result)
|
||||
@ -238,7 +238,7 @@ def batch_make_video_embeddings(
|
||||
embed_counter += cur_batch_embed_len
|
||||
video_counter += cur_batch_video_count
|
||||
|
||||
# ensure we don't lost any videos or embeddings
|
||||
# ensure we don't lose any videos or embeddings
|
||||
assert embed_counter == video_embeds.size(0)
|
||||
assert video_counter == video_grid_thw.size(0)
|
||||
assert len(video_batches) == len(result)
|
||||
|
||||
@ -1,12 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
from io import BytesIO
|
||||
from typing import Callable
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
@ -118,9 +115,9 @@ def different_patch_input_cases_internvl():
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
|
||||
image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
|
||||
image = Image.open(BytesIO(requests.get(image_url).content))
|
||||
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
|
||||
image = ImageAsset("hato").pil_image
|
||||
|
||||
question = "Describe the image."
|
||||
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
|
||||
@ -293,6 +293,7 @@ def _test_processing_correctness_one(
|
||||
"OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
|
||||
"OpenGVLab/InternVL3_5-30B-A3B",
|
||||
"Kwai-Keye/Keye-VL-8B-Preview",
|
||||
"Kwai-Keye/Keye-VL-1_5-8B",
|
||||
"moonshotai/Kimi-VL-A3B-Instruct",
|
||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
|
||||
@ -1,30 +1,31 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import tempfile
|
||||
from collections.abc import Iterable
|
||||
from contextlib import contextmanager
|
||||
from functools import partial
|
||||
from typing import Any, Union
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch.nn as nn
|
||||
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
|
||||
UserMessage)
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.inputs import InputProcessingContext
|
||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
|
||||
MultiModalKwargs)
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
from vllm.utils import GiB_bytes, is_list_of, set_default_torch_num_threads
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
|
||||
from vllm.v1.engine.core import EngineCore as V1EngineCore
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
|
||||
from ...utils import dummy_hf_overrides
|
||||
|
||||
@ -137,6 +138,27 @@ def create_batched_mm_kwargs(
|
||||
return group_mm_kwargs_by_modality(items)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def initialize_dummy_model(model_cls: nn.Module, model_config: ModelConfig):
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
init_distributed_environment(
|
||||
world_size=1,
|
||||
rank=0,
|
||||
distributed_init_method=f"file://{temp_file}",
|
||||
local_rank=0,
|
||||
backend="nccl",
|
||||
)
|
||||
initialize_model_parallel(tensor_model_parallel_size=1)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
with set_current_vllm_config(vllm_config=vllm_config):
|
||||
with set_default_torch_dtype(model_config.dtype):
|
||||
model = model_cls(vllm_config=vllm_config)
|
||||
yield model
|
||||
|
||||
del model
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def get_model_id_to_test(
|
||||
model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
|
||||
filtered_results = []
|
||||
@ -155,8 +177,7 @@ def get_model_id_to_test(
|
||||
@pytest.mark.parametrize(
|
||||
"model_arch, model_id",
|
||||
get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
|
||||
def test_model_tensor_schema(model_arch: str, model_id: str,
|
||||
vllm_runner: type[VllmRunner], monkeypatch):
|
||||
def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
if model_arch in ARCH_TO_SKIP:
|
||||
pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
|
||||
if model_id in REPO_ID_TO_SKIP:
|
||||
@ -177,14 +198,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
hf_overrides=hf_overrides_fn,
|
||||
)
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
|
||||
|
||||
if not any(
|
||||
hasattr(model_cls, f"_parse_and_validate_{m}_input")
|
||||
for m in ["image", "video", "audio"]):
|
||||
inputs_parse_methods = []
|
||||
for attr_name in dir(model_cls):
|
||||
attr = getattr(model_cls, attr_name)
|
||||
if hasattr(attr, "__annotations__"):
|
||||
return_type = attr.__annotations__.get("return", None)
|
||||
if return_type is not None and "Input" in str(return_type):
|
||||
inputs_parse_methods.append(attr_name)
|
||||
|
||||
if not any(inputs_parse_methods):
|
||||
pytest.skip(f"{model_arch} does not support tensor schema validation.")
|
||||
|
||||
ctx = InputProcessingContext(
|
||||
@ -197,68 +224,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str,
|
||||
modality: 3 if limit is None else limit
|
||||
for modality, limit in supported_mm_limits.items()
|
||||
}
|
||||
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
|
||||
processor = factories.build_processor(ctx, cache=None)
|
||||
|
||||
# Avoid calling model.forward()
|
||||
def _initialize_kv_caches_v0(self) -> None:
|
||||
self.cache_config.num_gpu_blocks = 0
|
||||
self.cache_config.num_cpu_blocks = 0
|
||||
|
||||
def _initialize_kv_caches_v1(self, vllm_config):
|
||||
kv_cache_specs = self.model_executor.get_kv_cache_specs()
|
||||
scheduler_kv_cache_config = get_kv_cache_config(
|
||||
vllm_config,
|
||||
kv_cache_specs[0],
|
||||
10 * GiB_bytes,
|
||||
)
|
||||
|
||||
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
|
||||
return 1, 0, scheduler_kv_cache_config
|
||||
|
||||
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v0),
|
||||
patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v1), monkeypatch.context() as m):
|
||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
if model_info.v0_only:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
# TODO(Isotr0py): Can we avoid initializing engine?
|
||||
with (
|
||||
set_default_torch_num_threads(1),
|
||||
vllm_runner(
|
||||
model_id,
|
||||
tokenizer_name=model_info.tokenizer,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
max_model_len=model_info.max_model_len,
|
||||
load_format="dummy",
|
||||
hf_overrides=hf_overrides_fn,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
enforce_eager=True,
|
||||
) as vllm_model,
|
||||
):
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
llm_engine = vllm_model.llm.llm_engine
|
||||
|
||||
if hasattr(llm_engine, "processor"):
|
||||
# v1 processor
|
||||
mm_registry = llm_engine.processor.mm_registry
|
||||
else:
|
||||
# v0 input_preprocessor
|
||||
mm_registry = llm_engine.input_preprocessor.mm_registry
|
||||
|
||||
processor = mm_registry.create_processor(model_config)
|
||||
|
||||
def validate_model_input(model, modality: str,
|
||||
mm_kwargs: MultiModalKwargs):
|
||||
method_name = f"_parse_and_validate_{modality}_input"
|
||||
if hasattr(model, method_name):
|
||||
getattr(model, method_name)(**mm_kwargs)
|
||||
|
||||
for modality, _, mm_kwargs in create_batched_mm_kwargs(
|
||||
model_config, processor):
|
||||
valid_func = partial(validate_model_input,
|
||||
modality=modality,
|
||||
mm_kwargs=mm_kwargs)
|
||||
vllm_model.apply_model(valid_func)
|
||||
with initialize_dummy_model(model_cls, model_config) as model:
|
||||
for modality, _, mm_kwargs in create_batched_mm_kwargs(
|
||||
model_config, processor):
|
||||
for method_name in inputs_parse_methods:
|
||||
print(f"Testing `{method_name}` with modality={modality} "
|
||||
f"and mm_kwargs{list(mm_kwargs.keys())}")
|
||||
getattr(model, method_name)(modality=modality, **mm_kwargs)
|
||||
|
||||
@ -137,6 +137,9 @@ class _HfExamplesInfo:
|
||||
# yapf: disable
|
||||
_TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
# [Decoder-only]
|
||||
"ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B",
|
||||
min_transformers_version="4.56.0",
|
||||
trust_remote_code=True),
|
||||
"AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
|
||||
trust_remote_code=True),
|
||||
"AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B",
|
||||
@ -151,7 +154,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
|
||||
trust_remote_code=True),
|
||||
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
|
||||
min_transformers_version="4.56.0",
|
||||
min_transformers_version="4.55.3",
|
||||
extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501
|
||||
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
|
||||
{"1b": "bigscience/bloomz-1b1"}),
|
||||
@ -205,7 +208,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"),
|
||||
"GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
|
||||
"GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
|
||||
"GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501
|
||||
"GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
|
||||
min_transformers_version="4.55.3"),
|
||||
"GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"), # noqa: E501
|
||||
"Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
|
||||
trust_remote_code=True),
|
||||
@ -225,7 +229,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
|
||||
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
|
||||
min_transformers_version="4.56.0",
|
||||
min_transformers_version="4.55.3",
|
||||
extras={
|
||||
"tiny": "ai21labs/Jamba-tiny-dev",
|
||||
"random": "ai21labs/Jamba-tiny-random", # noqa: E501
|
||||
@ -241,7 +245,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||
is_available_online=False),
|
||||
"MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
|
||||
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
|
||||
"Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
|
||||
min_transformers_version="4.55.3",
|
||||
extras={
|
||||
"random": "yujiepan/mamba2-codestral-v0.1-tiny-random", # noqa: E501
|
||||
}),
|
||||
"FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501
|
||||
"MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
|
||||
trust_remote_code=True),
|
||||
@ -435,6 +443,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), # noqa: E501
|
||||
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
|
||||
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
|
||||
trust_remote_code=True),
|
||||
|
||||
@ -24,6 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
|
||||
def test_registry_imports(model_arch):
|
||||
# Skip if transformers version is incompatible
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
# Ensure all model classes can be imported successfully
|
||||
model_cls = ModelRegistry._try_load_model_cls(model_arch)
|
||||
assert model_cls is not None
|
||||
|
||||
@ -636,8 +636,10 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
|
||||
|
||||
# Run the model through the sharded function
|
||||
with torch.inference_mode():
|
||||
sharded_output = run_dp_sharded_mrope_vision_model(
|
||||
vision_model, pixel_values, grid_thw_list)
|
||||
sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
|
||||
pixel_values,
|
||||
grid_thw_list,
|
||||
rope_type="rope_3d")
|
||||
sharded_output = torch.cat(sharded_output, dim=0)
|
||||
|
||||
# Check that the world size is setup correctly
|
||||
@ -691,8 +693,10 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(
|
||||
|
||||
# Should handle empty input gracefully
|
||||
with torch.inference_mode():
|
||||
output = run_dp_sharded_mrope_vision_model(vision_model, pixel_values,
|
||||
grid_thw_list)
|
||||
output = run_dp_sharded_mrope_vision_model(vision_model,
|
||||
pixel_values,
|
||||
grid_thw_list,
|
||||
rope_type="rope_3d")
|
||||
|
||||
assert len(output) == 0
|
||||
|
||||
@ -745,8 +749,10 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
|
||||
|
||||
# Should handle uneven distribution without errors
|
||||
with torch.inference_mode():
|
||||
output_tuple = run_dp_sharded_mrope_vision_model(
|
||||
vision_model, pixel_values, grid_thw_list)
|
||||
output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
|
||||
pixel_values,
|
||||
grid_thw_list,
|
||||
rope_type="rope_3d")
|
||||
|
||||
# Verify output shape is reasonable
|
||||
merge_factor = vision_model.spatial_merge_size**2
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
def register_prithvi_india():
|
||||
return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia" # noqa: E501
|
||||
|
||||
|
||||
def register_prithvi_valencia():
|
||||
return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorValencia" # noqa: E501
|
||||
@ -0,0 +1,432 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import datetime
|
||||
import os
|
||||
import tempfile
|
||||
import urllib.request
|
||||
from collections.abc import Sequence
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import albumentations
|
||||
import numpy as np
|
||||
import rasterio
|
||||
import regex as re
|
||||
import torch
|
||||
from einops import rearrange
|
||||
from terratorch.datamodules import Sen1Floods11NonGeoDataModule
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.entrypoints.openai.protocol import (IOProcessorRequest,
|
||||
IOProcessorResponse)
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import PoolingRequestOutput
|
||||
from vllm.plugins.io_processors.interface import (IOProcessor,
|
||||
IOProcessorInput,
|
||||
IOProcessorOutput)
|
||||
|
||||
from .types import DataModuleConfig, ImagePrompt, ImageRequestOutput
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
NO_DATA = -9999
|
||||
NO_DATA_FLOAT = 0.0001
|
||||
OFFSET = 0
|
||||
PERCENTILE = 99
|
||||
|
||||
DEFAULT_INPUT_INDICES = [0, 1, 2, 3, 4, 5]
|
||||
|
||||
datamodule_config: DataModuleConfig = {
|
||||
"bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
|
||||
"batch_size":
|
||||
16,
|
||||
"constant_scale":
|
||||
0.0001,
|
||||
"data_root":
|
||||
"/dccstor/geofm-finetuning/datasets/sen1floods11",
|
||||
"drop_last":
|
||||
True,
|
||||
"no_data_replace":
|
||||
0.0,
|
||||
"no_label_replace":
|
||||
-1,
|
||||
"num_workers":
|
||||
8,
|
||||
"test_transform": [
|
||||
albumentations.Resize(always_apply=False,
|
||||
height=448,
|
||||
interpolation=1,
|
||||
p=1,
|
||||
width=448),
|
||||
albumentations.pytorch.ToTensorV2(transpose_mask=False,
|
||||
always_apply=True,
|
||||
p=1.0),
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def save_geotiff(image: torch.Tensor, meta: dict,
|
||||
out_format: str) -> str | bytes:
|
||||
"""Save multi-band image in Geotiff file.
|
||||
|
||||
Args:
|
||||
image: np.ndarray with shape (bands, height, width)
|
||||
output_path: path where to save the image
|
||||
meta: dict with meta info.
|
||||
"""
|
||||
if out_format == "path":
|
||||
# create temp file
|
||||
file_path = os.path.join(os.getcwd(), "prediction.tiff")
|
||||
with rasterio.open(file_path, "w", **meta) as dest:
|
||||
for i in range(image.shape[0]):
|
||||
dest.write(image[i, :, :], i + 1)
|
||||
|
||||
return file_path
|
||||
elif out_format == "b64_json":
|
||||
with tempfile.NamedTemporaryFile() as tmpfile:
|
||||
with rasterio.open(tmpfile.name, "w", **meta) as dest:
|
||||
for i in range(image.shape[0]):
|
||||
dest.write(image[i, :, :], i + 1)
|
||||
|
||||
file_data = tmpfile.read()
|
||||
return base64.b64encode(file_data)
|
||||
|
||||
else:
|
||||
raise ValueError("Unknown output format")
|
||||
|
||||
|
||||
def _convert_np_uint8(float_image: torch.Tensor):
|
||||
image = float_image.numpy() * 255.0
|
||||
image = image.astype(dtype=np.uint8)
|
||||
|
||||
return image
|
||||
|
||||
|
||||
def read_geotiff(
|
||||
file_path: Optional[str] = None,
|
||||
path_type: Optional[str] = None,
|
||||
file_data: Optional[bytes] = None,
|
||||
) -> tuple[torch.Tensor, dict, tuple[float, float] | None]:
|
||||
"""Read all bands from *file_path* and return image + meta info.
|
||||
|
||||
Args:
|
||||
file_path: path to image file.
|
||||
|
||||
Returns:
|
||||
np.ndarray with shape (bands, height, width)
|
||||
meta info dict
|
||||
"""
|
||||
|
||||
if all([x is None for x in [file_path, path_type, file_data]]):
|
||||
raise Exception("All input fields to read_geotiff are None")
|
||||
write_to_file: Optional[bytes] = None
|
||||
path: Optional[str] = None
|
||||
if file_data is not None:
|
||||
# with tempfile.NamedTemporaryFile() as tmpfile:
|
||||
# tmpfile.write(file_data)
|
||||
# path = tmpfile.name
|
||||
|
||||
write_to_file = file_data
|
||||
elif file_path is not None and path_type == "url":
|
||||
resp = urllib.request.urlopen(file_path)
|
||||
# with tempfile.NamedTemporaryFile() as tmpfile:
|
||||
# tmpfile.write(resp.read())
|
||||
# path = tmpfile.name
|
||||
write_to_file = resp.read()
|
||||
elif file_path is not None and path_type == "path":
|
||||
path = file_path
|
||||
elif file_path is not None and path_type == "b64_json":
|
||||
image_data = base64.b64decode(file_path)
|
||||
# with tempfile.NamedTemporaryFile() as tmpfile:
|
||||
# tmpfile.write(image_data)
|
||||
# path = tmpfile.name
|
||||
write_to_file = image_data
|
||||
else:
|
||||
raise Exception("Wrong combination of parameters to read_geotiff")
|
||||
|
||||
with tempfile.NamedTemporaryFile() as tmpfile:
|
||||
path_to_use = None
|
||||
if write_to_file:
|
||||
tmpfile.write(write_to_file)
|
||||
path_to_use = tmpfile.name
|
||||
elif path:
|
||||
path_to_use = path
|
||||
|
||||
with rasterio.open(path_to_use) as src:
|
||||
img = src.read()
|
||||
meta = src.meta
|
||||
try:
|
||||
coords = src.lnglat()
|
||||
except Exception:
|
||||
# Cannot read coords
|
||||
coords = None
|
||||
|
||||
return img, meta, coords
|
||||
|
||||
|
||||
def load_image(
|
||||
data: Union[list[str]],
|
||||
path_type: str,
|
||||
mean: Optional[list[float]] = None,
|
||||
std: Optional[list[float]] = None,
|
||||
indices: Optional[Union[list[int], None]] = None,
|
||||
):
|
||||
"""Build an input example by loading images in *file_paths*.
|
||||
|
||||
Args:
|
||||
file_paths: list of file paths .
|
||||
mean: list containing mean values for each band in the
|
||||
images in *file_paths*.
|
||||
std: list containing std values for each band in the
|
||||
images in *file_paths*.
|
||||
|
||||
Returns:
|
||||
np.array containing created example
|
||||
list of meta info for each image in *file_paths*
|
||||
"""
|
||||
|
||||
imgs = []
|
||||
metas = []
|
||||
temporal_coords = []
|
||||
location_coords = []
|
||||
|
||||
for file in data:
|
||||
# if isinstance(file, bytes):
|
||||
# img, meta, coords = read_geotiff(file_data=file)
|
||||
# else:
|
||||
img, meta, coords = read_geotiff(file_path=file, path_type=path_type)
|
||||
# Rescaling (don't normalize on nodata)
|
||||
img = np.moveaxis(img, 0, -1) # channels last for rescaling
|
||||
if indices is not None:
|
||||
img = img[..., indices]
|
||||
if mean is not None and std is not None:
|
||||
img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)
|
||||
|
||||
imgs.append(img)
|
||||
metas.append(meta)
|
||||
if coords is not None:
|
||||
location_coords.append(coords)
|
||||
|
||||
try:
|
||||
match = re.search(r"(\d{7,8}T\d{6})", file)
|
||||
if match:
|
||||
year = int(match.group(1)[:4])
|
||||
julian_day = match.group(1).split("T")[0][4:]
|
||||
if len(julian_day) == 3:
|
||||
julian_day = int(julian_day)
|
||||
else:
|
||||
julian_day = (datetime.datetime.strptime(
|
||||
julian_day, "%m%d").timetuple().tm_yday)
|
||||
temporal_coords.append([year, julian_day])
|
||||
except Exception:
|
||||
logger.exception("Could not extract timestamp for %s", file)
|
||||
|
||||
imgs = np.stack(imgs, axis=0) # num_frames, H, W, C
|
||||
imgs = np.moveaxis(imgs, -1, 0).astype("float32") # C, num_frames, H, W
|
||||
imgs = np.expand_dims(imgs, axis=0) # add batch di
|
||||
|
||||
return imgs, temporal_coords, location_coords, metas
|
||||
|
||||
|
||||
class PrithviMultimodalDataProcessor(IOProcessor):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
|
||||
super().__init__(vllm_config)
|
||||
|
||||
self.datamodule = Sen1Floods11NonGeoDataModule(
|
||||
data_root=datamodule_config["data_root"],
|
||||
batch_size=datamodule_config["batch_size"],
|
||||
num_workers=datamodule_config["num_workers"],
|
||||
bands=datamodule_config["bands"],
|
||||
drop_last=datamodule_config["drop_last"],
|
||||
test_transform=datamodule_config["test_transform"],
|
||||
)
|
||||
self.img_size = 512
|
||||
self.h1 = 1
|
||||
self.w1 = 1
|
||||
self.original_h = 512
|
||||
self.original_w = 512
|
||||
self.batch_size = 1
|
||||
self.meta_data = None
|
||||
self.requests_cache: dict[str, dict[str, Any]] = {}
|
||||
self.indices = DEFAULT_INPUT_INDICES
|
||||
|
||||
def parse_request(self, request: Any) -> IOProcessorInput:
|
||||
if type(request) is dict:
|
||||
image_prompt = ImagePrompt(**request)
|
||||
return image_prompt
|
||||
if isinstance(request, IOProcessorRequest):
|
||||
if not hasattr(request, "data"):
|
||||
raise ValueError(
|
||||
"missing 'data' field in OpenAIBaseModel Request")
|
||||
|
||||
request_data = request.data
|
||||
|
||||
if type(request_data) is dict:
|
||||
return ImagePrompt(**request_data)
|
||||
else:
|
||||
raise ValueError("Unable to parse the request data")
|
||||
|
||||
raise ValueError("Unable to parse request")
|
||||
|
||||
def output_to_response(
|
||||
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
|
||||
return IOProcessorResponse(
|
||||
request_id=plugin_output.request_id,
|
||||
data=plugin_output,
|
||||
)
|
||||
|
||||
def pre_process(
|
||||
self,
|
||||
prompt: IOProcessorInput,
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Union[PromptType, Sequence[PromptType]]:
|
||||
|
||||
image_data = dict(prompt)
|
||||
|
||||
if request_id:
|
||||
self.requests_cache[request_id] = {
|
||||
"out_format": image_data["out_data_format"],
|
||||
}
|
||||
|
||||
input_data, temporal_coords, location_coords, meta_data = load_image(
|
||||
data=[image_data["data"]],
|
||||
indices=self.indices,
|
||||
path_type=image_data["data_format"],
|
||||
)
|
||||
|
||||
self.meta_data = meta_data[0]
|
||||
|
||||
if input_data.mean() > 1:
|
||||
input_data = input_data / 10000 # Convert to range 0-1
|
||||
|
||||
self.original_h, self.original_w = input_data.shape[-2:]
|
||||
pad_h = (self.img_size -
|
||||
(self.original_h % self.img_size)) % self.img_size
|
||||
pad_w = (self.img_size -
|
||||
(self.original_w % self.img_size)) % self.img_size
|
||||
input_data = np.pad(
|
||||
input_data,
|
||||
((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)),
|
||||
mode="reflect",
|
||||
)
|
||||
|
||||
batch = torch.tensor(input_data)
|
||||
windows = batch.unfold(3, self.img_size,
|
||||
self.img_size).unfold(4, self.img_size,
|
||||
self.img_size)
|
||||
self.h1, self.w1 = windows.shape[3:5]
|
||||
windows = rearrange(
|
||||
windows,
|
||||
"b c t h1 w1 h w -> (b h1 w1) c t h w",
|
||||
h=self.img_size,
|
||||
w=self.img_size,
|
||||
)
|
||||
|
||||
# Split into batches if number of windows > batch_size
|
||||
num_batches = (windows.shape[0] // self.batch_size
|
||||
if windows.shape[0] > self.batch_size else 1)
|
||||
windows = torch.tensor_split(windows, num_batches, dim=0)
|
||||
|
||||
if temporal_coords:
|
||||
temporal_coords = torch.tensor(temporal_coords).unsqueeze(0)
|
||||
else:
|
||||
temporal_coords = None
|
||||
if location_coords:
|
||||
location_coords = torch.tensor(location_coords[0]).unsqueeze(0)
|
||||
else:
|
||||
location_coords = None
|
||||
|
||||
prompts = []
|
||||
for window in windows:
|
||||
# Apply standardization
|
||||
window = self.datamodule.test_transform(
|
||||
image=window.squeeze().numpy().transpose(1, 2, 0))
|
||||
window = self.datamodule.aug(window)["image"]
|
||||
prompts.append({
|
||||
"prompt_token_ids": [1],
|
||||
"multi_modal_data": {
|
||||
"pixel_values": window.to(torch.float16)[0],
|
||||
"location_coords": location_coords.to(torch.float16),
|
||||
},
|
||||
})
|
||||
|
||||
return prompts
|
||||
|
||||
def post_process(
|
||||
self,
|
||||
model_output: Sequence[PoolingRequestOutput],
|
||||
request_id: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> IOProcessorOutput:
|
||||
|
||||
pred_imgs_list = []
|
||||
|
||||
if request_id and (request_id in self.requests_cache):
|
||||
out_format = self.requests_cache[request_id]["out_format"]
|
||||
else:
|
||||
out_format = "b64_json"
|
||||
|
||||
for output in model_output:
|
||||
y_hat = output.outputs.data.argmax(dim=1)
|
||||
pred = torch.nn.functional.interpolate(
|
||||
y_hat.unsqueeze(1).float(),
|
||||
size=self.img_size,
|
||||
mode="nearest",
|
||||
)
|
||||
pred_imgs_list.append(pred)
|
||||
|
||||
pred_imgs: torch.Tensor = torch.concat(pred_imgs_list, dim=0)
|
||||
|
||||
# Build images from patches
|
||||
pred_imgs = rearrange(
|
||||
pred_imgs,
|
||||
"(b h1 w1) c h w -> b c (h1 h) (w1 w)",
|
||||
h=self.img_size,
|
||||
w=self.img_size,
|
||||
b=1,
|
||||
c=1,
|
||||
h1=self.h1,
|
||||
w1=self.w1,
|
||||
)
|
||||
|
||||
# Cut padded area back to original size
|
||||
pred_imgs = pred_imgs[..., :self.original_h, :self.original_w]
|
||||
|
||||
# Squeeze (batch size 1)
|
||||
pred_imgs = pred_imgs[0]
|
||||
|
||||
if not self.meta_data:
|
||||
raise ValueError("No metadata available for the current task")
|
||||
self.meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
|
||||
out_data = save_geotiff(_convert_np_uint8(pred_imgs), self.meta_data,
|
||||
out_format)
|
||||
|
||||
return ImageRequestOutput(type=out_format,
|
||||
format="tiff",
|
||||
data=out_data,
|
||||
request_id=request_id)
|
||||
|
||||
|
||||
class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
|
||||
super().__init__(vllm_config)
|
||||
|
||||
self.indices = [1, 2, 3, 8, 11, 12]
|
||||
|
||||
|
||||
class PrithviMultimodalDataProcessorValencia(PrithviMultimodalDataProcessor):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
|
||||
super().__init__(vllm_config)
|
||||
|
||||
self.indices = [0, 1, 2, 3, 4, 5]
|
||||
@ -0,0 +1,59 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Any, Literal, Optional, TypedDict, Union
|
||||
|
||||
import albumentations
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class DataModuleConfig(TypedDict):
|
||||
bands: list[str]
|
||||
batch_size: int
|
||||
constant_scale: float
|
||||
data_root: str
|
||||
drop_last: bool
|
||||
no_data_replace: float
|
||||
no_label_replace: int
|
||||
num_workers: int
|
||||
test_transform: list[
|
||||
albumentations.core.transforms_interface.BasicTransform]
|
||||
|
||||
|
||||
class ImagePrompt(BaseModel):
|
||||
|
||||
data_format: Literal["b64_json", "bytes", "url"]
|
||||
"""
|
||||
This is the data type for the input image
|
||||
"""
|
||||
|
||||
image_format: str
|
||||
"""
|
||||
This is the image format (e.g., jpeg, png, etc.)
|
||||
"""
|
||||
|
||||
out_data_format: Literal["b64_json", "url"]
|
||||
|
||||
data: Any
|
||||
"""
|
||||
Input image data
|
||||
"""
|
||||
|
||||
|
||||
MultiModalPromptType = Union[ImagePrompt]
|
||||
|
||||
|
||||
class ImageRequestOutput(BaseModel):
|
||||
"""
|
||||
The output data of an image request to vLLM.
|
||||
|
||||
Args:
|
||||
type (str): The data content type [path, object]
|
||||
format (str): The image format (e.g., jpeg, png, etc.)
|
||||
data (Any): The resulting data.
|
||||
"""
|
||||
|
||||
type: Literal["path", "b64_json"]
|
||||
format: str
|
||||
data: str
|
||||
request_id: Optional[str] = None
|
||||
16
tests/plugins/prithvi_io_processor_plugin/setup.py
Normal file
16
tests/plugins/prithvi_io_processor_plugin/setup.py
Normal file
@ -0,0 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="prithvi_io_processor_plugin",
|
||||
version="0.1",
|
||||
packages=["prithvi_io_processor"],
|
||||
entry_points={
|
||||
"vllm.io_processor_plugins": [
|
||||
"prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india", # noqa: E501
|
||||
"prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia", # noqa: E501
|
||||
]
|
||||
},
|
||||
)
|
||||
@ -1,12 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
138
tests/plugins_tests/test_io_processor_plugins.py
Normal file
138
tests/plugins_tests/test_io_processor_plugins.py
Normal file
@ -0,0 +1,138 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.entrypoints.openai.protocol import IOProcessorResponse
|
||||
from vllm.plugins.io_processors import get_io_processor
|
||||
from vllm.pooling_params import PoolingParams
|
||||
|
||||
MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
|
||||
|
||||
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501
|
||||
|
||||
|
||||
def test_loading_missing_plugin():
|
||||
vllm_config = VllmConfig()
|
||||
with pytest.raises(ValueError):
|
||||
get_io_processor(vllm_config, "wrong_plugin")
|
||||
|
||||
|
||||
def test_loading_engine_with_wrong_plugin():
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
LLM(
|
||||
model=MODEL_NAME,
|
||||
skip_tokenizer_init=True,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
# Limit the maximum number of parallel requests
|
||||
# to avoid the model going OOM in CI.
|
||||
max_num_seqs=32,
|
||||
io_processor_plugin="wrong_plugin",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
|
||||
|
||||
img_prompt = dict(
|
||||
data=image_url,
|
||||
data_format="url",
|
||||
image_format="tiff",
|
||||
out_data_format="b64_json",
|
||||
)
|
||||
|
||||
pooling_params = PoolingParams(task="encode", softmax=False)
|
||||
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
runner="pooling",
|
||||
skip_tokenizer_init=True,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
# Limit the maximum number of parallel requests
|
||||
# to avoid the model going OOM in CI.
|
||||
max_num_seqs=1,
|
||||
io_processor_plugin="prithvi_to_tiff_valencia",
|
||||
) as llm_runner:
|
||||
pooler_output = llm_runner.get_llm().encode(
|
||||
img_prompt,
|
||||
pooling_params=pooling_params,
|
||||
)
|
||||
output = pooler_output[0].outputs
|
||||
|
||||
# verify the output is formatted as expected for this plugin
|
||||
assert all(
|
||||
hasattr(output, attr)
|
||||
for attr in ["type", "format", "data", "request_id"])
|
||||
|
||||
# We just check that the output is a valid base64 string.
|
||||
# Raises an exception and fails the test if the string is corrupted.
|
||||
base64.b64decode(output.data)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--enforce-eager",
|
||||
"--trust-remote-code",
|
||||
"--skip-tokenizer-init",
|
||||
# Limit the maximum number of parallel requests
|
||||
# to avoid the model going OOM in CI.
|
||||
"--max-num-seqs",
|
||||
"32",
|
||||
"--io-processor-plugin",
|
||||
"prithvi_to_tiff_valencia"
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_prithvi_mae_plugin_online(
|
||||
server: RemoteOpenAIServer,
|
||||
model_name: str,
|
||||
):
|
||||
|
||||
request_payload_url = {
|
||||
"data": {
|
||||
"data": image_url,
|
||||
"data_format": "url",
|
||||
"image_format": "tiff",
|
||||
"out_data_format": "b64_json",
|
||||
},
|
||||
"priority": 0,
|
||||
"model": model_name,
|
||||
"softmax": False
|
||||
}
|
||||
|
||||
ret = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json=request_payload_url,
|
||||
)
|
||||
|
||||
response = ret.json()
|
||||
|
||||
# verify the request response is in the correct format
|
||||
assert (parsed_response := IOProcessorResponse(**response))
|
||||
|
||||
# verify the output is formatted as expected for this plugin
|
||||
plugin_data = parsed_response.data
|
||||
|
||||
assert all(
|
||||
plugin_data.get(attr)
|
||||
for attr in ["type", "format", "data", "request_id"])
|
||||
|
||||
# We just check that the output is a valid base64 string.
|
||||
# Raises an exception and fails the test if the string is corrupted.
|
||||
base64.b64decode(plugin_data["data"])
|
||||
@ -7,6 +7,15 @@ import torch
|
||||
from vllm.plugins import load_general_plugins
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
def test_platform_plugins():
|
||||
# simulate workload by running an example
|
||||
import runpy
|
||||
|
||||
@ -64,8 +64,6 @@ def _run_incremental_decode(tokenizer,
|
||||
request = EngineCoreRequest("",
|
||||
prompt_token_ids,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
params,
|
||||
None,
|
||||
None,
|
||||
|
||||
@ -2,12 +2,17 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
from collections.abc import Generator
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall
|
||||
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
|
||||
DeltaMessage, FunctionCall,
|
||||
ToolCall)
|
||||
from vllm.entrypoints.openai.tool_parsers import xLAMToolParser
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.transformers_utils.detokenizer import detokenize_incrementally
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
|
||||
# Use a common model that is likely to be available
|
||||
MODEL = "Salesforce/Llama-xLAM-2-8B-fc-r"
|
||||
@ -36,6 +41,56 @@ def assert_tool_calls(actual_tool_calls: list[ToolCall],
|
||||
assert actual_tool_call.function == expected_tool_call.function
|
||||
|
||||
|
||||
def stream_delta_message_generator(
|
||||
xlam_tool_parser: xLAMToolParser,
|
||||
xlam_tokenizer: AnyTokenizer,
|
||||
model_output: str,
|
||||
request: Optional[ChatCompletionRequest] = None,
|
||||
) -> Generator[DeltaMessage, None, None]:
|
||||
all_token_ids = xlam_tokenizer.encode(model_output,
|
||||
add_special_tokens=False)
|
||||
|
||||
previous_text = ""
|
||||
previous_tokens = None
|
||||
prefix_offset = 0
|
||||
read_offset = 0
|
||||
for i, delta_token in enumerate(all_token_ids):
|
||||
delta_token_ids = [delta_token]
|
||||
previous_token_ids = all_token_ids[:i]
|
||||
current_token_ids = all_token_ids[:i + 1]
|
||||
|
||||
(new_tokens, delta_text, new_prefix_offset,
|
||||
new_read_offset) = (detokenize_incrementally(
|
||||
tokenizer=xlam_tokenizer,
|
||||
all_input_ids=current_token_ids,
|
||||
prev_tokens=previous_tokens,
|
||||
prefix_offset=prefix_offset,
|
||||
read_offset=read_offset,
|
||||
skip_special_tokens=False,
|
||||
spaces_between_special_tokens=True,
|
||||
))
|
||||
|
||||
current_text = previous_text + delta_text
|
||||
|
||||
delta_message = xlam_tool_parser.extract_tool_calls_streaming(
|
||||
previous_text,
|
||||
current_text,
|
||||
delta_text,
|
||||
previous_token_ids,
|
||||
current_token_ids,
|
||||
delta_token_ids,
|
||||
request=request,
|
||||
)
|
||||
if delta_message:
|
||||
yield delta_message
|
||||
|
||||
previous_text = current_text
|
||||
previous_tokens = (previous_tokens +
|
||||
new_tokens if previous_tokens else new_tokens)
|
||||
prefix_offset = new_prefix_offset
|
||||
read_offset = new_read_offset
|
||||
|
||||
|
||||
def test_extract_tool_calls_no_tools(xlam_tool_parser):
|
||||
model_output = "This is a test"
|
||||
extracted_tool_calls = xlam_tool_parser.extract_tool_calls(
|
||||
@ -51,6 +106,7 @@ def test_extract_tool_calls_no_tools(xlam_tool_parser):
|
||||
"single_tool_with_think_tag",
|
||||
"single_tool_with_json_code_block",
|
||||
"single_tool_with_tool_calls_tag",
|
||||
"single_tool_with_tool_call_xml_tags",
|
||||
],
|
||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||
argvalues=[
|
||||
@ -118,6 +174,20 @@ def test_extract_tool_calls_no_tools(xlam_tool_parser):
|
||||
],
|
||||
"I'll check the weather for you.",
|
||||
),
|
||||
(
|
||||
"""I'll help you check the weather.<tool_call>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]</tool_call>""", # noqa: E501
|
||||
[
|
||||
ToolCall(function=FunctionCall(
|
||||
name="get_current_weather",
|
||||
arguments=json.dumps({
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
}),
|
||||
))
|
||||
],
|
||||
"I'll help you check the weather.",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_extract_tool_calls(xlam_tool_parser, model_output,
|
||||
@ -245,3 +315,147 @@ def test_streaming_with_list_structure(xlam_tool_parser):
|
||||
assert hasattr(result, "tool_calls")
|
||||
assert len(result.tool_calls) == 1
|
||||
assert result.tool_calls[0].function.name == "get_current_weather"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
ids=[
|
||||
"parallel_tool_calls",
|
||||
"single_tool_with_think_tag",
|
||||
"single_tool_with_json_code_block",
|
||||
"single_tool_with_tool_calls_tag",
|
||||
"single_tool_with_tool_call_xml_tags",
|
||||
],
|
||||
argnames=["model_output", "expected_tool_calls", "expected_content"],
|
||||
argvalues=[
|
||||
(
|
||||
"""[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}, {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}]""", # noqa: E501
|
||||
[
|
||||
ToolCall(function=FunctionCall(
|
||||
name="get_current_weather",
|
||||
arguments=json.dumps({
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
}),
|
||||
)),
|
||||
ToolCall(function=FunctionCall(
|
||||
name="get_current_weather",
|
||||
arguments=json.dumps({
|
||||
"city": "Orlando",
|
||||
"state": "FL",
|
||||
"unit": "fahrenheit",
|
||||
}),
|
||||
)),
|
||||
],
|
||||
"",
|
||||
),
|
||||
(
|
||||
"""<think>I'll help you with that.</think>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""", # noqa: E501
|
||||
[
|
||||
ToolCall(function=FunctionCall(
|
||||
name="get_current_weather",
|
||||
arguments=json.dumps({
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
}),
|
||||
))
|
||||
],
|
||||
"<think>I'll help you with that.</think>",
|
||||
),
|
||||
(
|
||||
"""```json\n[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]\n```""", # noqa: E501
|
||||
[
|
||||
ToolCall(function=FunctionCall(
|
||||
name="get_current_weather",
|
||||
arguments=json.dumps({
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
}),
|
||||
))
|
||||
],
|
||||
"",
|
||||
),
|
||||
(
|
||||
"""[TOOL_CALLS][{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]""", # noqa: E501
|
||||
[
|
||||
ToolCall(function=FunctionCall(
|
||||
name="get_current_weather",
|
||||
arguments=json.dumps({
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
}),
|
||||
))
|
||||
],
|
||||
"",
|
||||
),
|
||||
(
|
||||
"""I can help with that.<tool_call>[{"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]</tool_call>""", # noqa: E501
|
||||
[
|
||||
ToolCall(function=FunctionCall(
|
||||
name="get_current_weather",
|
||||
arguments=json.dumps({
|
||||
"city": "Dallas",
|
||||
"state": "TX",
|
||||
"unit": "fahrenheit",
|
||||
}),
|
||||
))
|
||||
],
|
||||
"I can help with that.",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_extract_tool_calls_streaming_incremental(
|
||||
xlam_tool_parser,
|
||||
xlam_tokenizer,
|
||||
model_output,
|
||||
expected_tool_calls,
|
||||
expected_content,
|
||||
):
|
||||
"""Verify the XLAM Parser streaming behavior by verifying each chunk is as expected.""" # noqa: E501
|
||||
request = ChatCompletionRequest(model=MODEL, messages=[], tools=[])
|
||||
|
||||
chunks = []
|
||||
for delta_message in stream_delta_message_generator(
|
||||
xlam_tool_parser, xlam_tokenizer, model_output, request):
|
||||
chunks.append(delta_message)
|
||||
|
||||
# Should have multiple chunks
|
||||
assert len(chunks) >= 3
|
||||
|
||||
# Should have a chunk with tool header (id, name, type) for the first tool call # noqa: E501
|
||||
header_found = False
|
||||
expected_first_tool = expected_tool_calls[0]
|
||||
for chunk in chunks:
|
||||
if chunk.tool_calls and chunk.tool_calls[0].id:
|
||||
header_found = True
|
||||
assert (chunk.tool_calls[0].function.name ==
|
||||
expected_first_tool.function.name)
|
||||
assert chunk.tool_calls[0].type == "function"
|
||||
# Arguments may be empty initially or None
|
||||
if chunk.tool_calls[0].function.arguments is not None:
|
||||
# If present, should be empty string initially
|
||||
assert chunk.tool_calls[0].function.arguments == ""
|
||||
break
|
||||
assert header_found
|
||||
|
||||
# Should have chunks with incremental arguments
|
||||
arg_chunks = []
|
||||
for chunk in chunks:
|
||||
if (chunk.tool_calls and chunk.tool_calls[0].function.arguments
|
||||
and chunk.tool_calls[0].function.arguments != ""
|
||||
and chunk.tool_calls[0].index ==
|
||||
0 # Only collect arguments from the first tool call
|
||||
):
|
||||
arg_chunks.append(chunk.tool_calls[0].function.arguments)
|
||||
|
||||
# Arguments should be streamed incrementally
|
||||
assert len(arg_chunks) > 1
|
||||
|
||||
# Concatenated arguments should form valid JSON for the first tool call
|
||||
full_args = "".join(arg_chunks)
|
||||
parsed_args = json.loads(full_args)
|
||||
expected_args = json.loads(expected_first_tool.function.arguments)
|
||||
assert parsed_args == expected_args
|
||||
|
||||
@ -379,9 +379,9 @@ def test_duplicate_dict_args(caplog_vllm, parser):
|
||||
def test_supports_kw(callable,kw_name,requires_kw_only,
|
||||
allow_var_kwargs,is_supported):
|
||||
assert supports_kw(
|
||||
callable=callable,
|
||||
kw_name=kw_name,
|
||||
requires_kw_only=requires_kw_only,
|
||||
callable=callable,
|
||||
kw_name=kw_name,
|
||||
requires_kw_only=requires_kw_only,
|
||||
allow_var_kwargs=allow_var_kwargs
|
||||
) == is_supported
|
||||
|
||||
@ -948,6 +948,36 @@ def test_join_host_port():
|
||||
assert join_host_port("::1", 5555) == "[::1]:5555"
|
||||
|
||||
|
||||
def test_json_count_leaves():
|
||||
"""Test json_count_leaves function from jsontree utility."""
|
||||
from vllm.utils.jsontree import json_count_leaves
|
||||
|
||||
# Single leaf values
|
||||
assert json_count_leaves(42) == 1
|
||||
assert json_count_leaves("hello") == 1
|
||||
assert json_count_leaves(None) == 1
|
||||
|
||||
# Empty containers
|
||||
assert json_count_leaves([]) == 0
|
||||
assert json_count_leaves({}) == 0
|
||||
assert json_count_leaves(()) == 0
|
||||
|
||||
# Flat structures
|
||||
assert json_count_leaves([1, 2, 3]) == 3
|
||||
assert json_count_leaves({"a": 1, "b": 2}) == 2
|
||||
assert json_count_leaves((1, 2, 3)) == 3
|
||||
|
||||
# Nested structures
|
||||
nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
|
||||
assert json_count_leaves(nested_dict) == 3
|
||||
|
||||
nested_list = [1, [2, 3], 4]
|
||||
assert json_count_leaves(nested_list) == 4
|
||||
|
||||
mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
|
||||
assert json_count_leaves(mixed_nested) == 4
|
||||
|
||||
|
||||
def test_convert_ids_list_to_tokens():
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
|
||||
token_ids = tokenizer.encode("Hello, world!")
|
||||
|
||||
@ -7,7 +7,8 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import GiB_bytes, sha256, sha256_cbor_64bit
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager
|
||||
@ -37,17 +38,20 @@ def make_request(
|
||||
mm_hashes: Optional[list[str]] = None,
|
||||
cache_salt: Optional[str] = None,
|
||||
):
|
||||
if mm_positions is None:
|
||||
mm_kwargs = None
|
||||
else:
|
||||
mm_item = MultiModalKwargsItem.dummy("dummy_m")
|
||||
mm_kwargs = [mm_item] * len(mm_positions)
|
||||
mm_features = []
|
||||
if mm_positions is not None:
|
||||
for j, position in enumerate(mm_positions):
|
||||
identifier = mm_hashes[j] if mm_hashes else f"hash_{j}"
|
||||
mm_feature = MultiModalFeatureSpec(
|
||||
data=MultiModalKwargsItem.dummy("dummy_m"),
|
||||
mm_position=position,
|
||||
identifier=identifier,
|
||||
modality="image")
|
||||
mm_features.append(mm_feature)
|
||||
|
||||
return Request(request_id=request_id,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_kwargs=mm_kwargs,
|
||||
multi_modal_hashes=mm_hashes,
|
||||
multi_modal_placeholders=mm_positions,
|
||||
mm_features=mm_features if mm_features else None,
|
||||
sampling_params=SamplingParams(max_tokens=17),
|
||||
pooling_params=None,
|
||||
eos_token_id=100,
|
||||
@ -243,7 +247,7 @@ def test_free_kv_cache_block_queue_append_n():
|
||||
|
||||
def test_free_kv_cache_block_queue_popleft_n():
|
||||
blocks = [KVCacheBlock(block_id=i) for i in range(6)]
|
||||
# Create a empty FreeKVCacheBlockQueue with these blocks
|
||||
# Create an empty FreeKVCacheBlockQueue with these blocks
|
||||
queue = FreeKVCacheBlockQueue(
|
||||
[blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]])
|
||||
assert queue.num_free_blocks == 6
|
||||
@ -597,8 +601,14 @@ def test_unify_kv_cache_configs():
|
||||
]
|
||||
|
||||
unify_kv_cache_configs(need_sort_kv_cache_config)
|
||||
assert need_sort_kv_cache_config[0].num_blocks == 10
|
||||
assert need_sort_kv_cache_config[1].num_blocks == 10
|
||||
sorted_kv_cache_groups = [
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"], new_kv_cache_spec(num_kv_heads=4)),
|
||||
]
|
||||
assert (
|
||||
need_sort_kv_cache_config[0].kv_cache_groups == sorted_kv_cache_groups)
|
||||
assert (
|
||||
need_sort_kv_cache_config[1].kv_cache_groups == sorted_kv_cache_groups)
|
||||
|
||||
diff_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
|
||||
@ -9,7 +9,8 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved
|
||||
from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import sha256, sha256_cbor_64bit
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
@ -32,17 +33,20 @@ def make_request(
|
||||
prompt_logprobs: Optional[int] = None,
|
||||
cache_salt: Optional[str] = None,
|
||||
):
|
||||
if mm_positions is None:
|
||||
mm_kwargs = None
|
||||
else:
|
||||
mm_item = MultiModalKwargsItem.dummy("dummy_m")
|
||||
mm_kwargs = [mm_item] * len(mm_positions)
|
||||
mm_features = []
|
||||
if mm_positions is not None:
|
||||
for j, position in enumerate(mm_positions):
|
||||
identifier = mm_hashes[j] if mm_hashes else f"hash_{j}"
|
||||
mm_feature = MultiModalFeatureSpec(
|
||||
data=MultiModalKwargsItem.dummy("dummy_m"),
|
||||
mm_position=position,
|
||||
identifier=identifier,
|
||||
modality="image")
|
||||
mm_features.append(mm_feature)
|
||||
|
||||
return Request(request_id=request_id,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_kwargs=mm_kwargs,
|
||||
multi_modal_hashes=mm_hashes,
|
||||
multi_modal_placeholders=mm_positions,
|
||||
mm_features=mm_features if mm_features else None,
|
||||
sampling_params=SamplingParams(
|
||||
max_tokens=17, prompt_logprobs=prompt_logprobs),
|
||||
pooling_params=None,
|
||||
|
||||
@ -8,7 +8,8 @@ import torch
|
||||
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
@ -1308,21 +1309,24 @@ def create_requests_with_priority(
|
||||
prompt_logprobs=prompt_logprobs)
|
||||
requests = []
|
||||
for i in range(num_requests):
|
||||
mm_features = []
|
||||
if mm_positions is not None:
|
||||
mm_position = mm_positions[i]
|
||||
mm_item = MultiModalKwargsItem.dummy("dummy_m")
|
||||
mm_kwargs = [mm_item] * len(mm_position)
|
||||
else:
|
||||
mm_position = None
|
||||
mm_kwargs = None
|
||||
for j, position in enumerate(mm_position):
|
||||
identifier = f"hash{i}_{j}"
|
||||
mm_feature = MultiModalFeatureSpec(
|
||||
data=MultiModalKwargsItem.dummy("dummy_m"),
|
||||
mm_position=position,
|
||||
identifier=identifier,
|
||||
modality="image")
|
||||
mm_features.append(mm_feature)
|
||||
|
||||
request = Request(
|
||||
request_id=f"{i + starting_idx}",
|
||||
prompt_token_ids=[i + starting_idx] * num_tokens,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=None,
|
||||
multi_modal_kwargs=mm_kwargs,
|
||||
multi_modal_placeholders=mm_position,
|
||||
multi_modal_hashes=None,
|
||||
mm_features=mm_features if mm_features else None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
arrival_time=arrival_times[i],
|
||||
priority=priorities[i],
|
||||
@ -1801,9 +1805,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
|
||||
request = Request(
|
||||
request_id="0",
|
||||
prompt_token_ids=[0, 1],
|
||||
multi_modal_kwargs=None,
|
||||
multi_modal_hashes=None,
|
||||
multi_modal_placeholders=None,
|
||||
mm_features=None,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
|
||||
@ -6,7 +6,8 @@ import torch
|
||||
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
|
||||
init_none_hash)
|
||||
@ -139,19 +140,20 @@ def create_requests(
|
||||
prompt_logprobs=prompt_logprobs)
|
||||
requests = []
|
||||
for i in range(num_requests):
|
||||
mm_features = []
|
||||
if mm_positions is not None:
|
||||
mm_position = mm_positions[i]
|
||||
mm_item = MultiModalKwargsItem.dummy("dummy_m")
|
||||
mm_kwargs = [mm_item] * len(mm_position)
|
||||
# Dummy hash for each mm item should be unique
|
||||
# since encoder cache tracks entries by hash
|
||||
mm_hashes = [
|
||||
"hash" + str(i) + "_" + str(j) for j in range(len(mm_position))
|
||||
]
|
||||
else:
|
||||
mm_position = None
|
||||
mm_kwargs = None
|
||||
mm_hashes = None
|
||||
for j, position in enumerate(mm_position):
|
||||
# Dummy hash for each mm item should be unique
|
||||
# since encoder cache tracks entries by hash
|
||||
identifier = f"hash{i}_{j}"
|
||||
mm_feature = MultiModalFeatureSpec(
|
||||
data=MultiModalKwargsItem.dummy("dummy_m"),
|
||||
mm_position=position,
|
||||
identifier=identifier,
|
||||
modality="image")
|
||||
mm_features.append(mm_feature)
|
||||
|
||||
prompt_token_ids = ([0] * num_tokens if same_prompt else [i] *
|
||||
num_tokens)
|
||||
request = Request(
|
||||
@ -159,9 +161,7 @@ def create_requests(
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=None,
|
||||
multi_modal_kwargs=mm_kwargs,
|
||||
multi_modal_placeholders=mm_position,
|
||||
multi_modal_hashes=mm_hashes,
|
||||
mm_features=mm_features if mm_features else None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
block_hasher=block_hasher,
|
||||
)
|
||||
|
||||
@ -64,6 +64,7 @@ def cleanup(llm: LLM, compilation_config: CompilationConfig):
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@pytest.mark.parametrize("enforce_eager", [True])
|
||||
@pytest.mark.skip(reason="Disable until Gemma3n supports fast prefill")
|
||||
def test_kv_sharing_fast_prefill(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
enforce_eager: bool,
|
||||
|
||||
@ -35,9 +35,7 @@ def make_request() -> EngineCoreRequest:
|
||||
return EngineCoreRequest(
|
||||
request_id=str(uuid.uuid4()),
|
||||
prompt_token_ids=PROMPT_TOKENS,
|
||||
mm_kwargs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
mm_features=None,
|
||||
sampling_params=SamplingParams(),
|
||||
pooling_params=None,
|
||||
eos_token_id=None,
|
||||
@ -308,17 +306,17 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
# Schedule Batch 1: (10, req0)
|
||||
assert engine_core.step_with_batch_queue()[0] is None
|
||||
assert engine_core.batch_queue.qsize() == 1
|
||||
scheduler_output = engine_core.batch_queue.queue[-1][1]
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 10
|
||||
# num_computed_tokens should have been updated immediately.
|
||||
assert engine_core.scheduler.requests[
|
||||
req0.request_id].num_computed_tokens == 10
|
||||
|
||||
# Schedule Batch 2: (2, req0), (8, req1)
|
||||
assert engine_core.step_with_batch_queue()[0] is None
|
||||
assert engine_core.batch_queue.qsize() == 2
|
||||
scheduler_output = engine_core.batch_queue.queue[-1][1]
|
||||
assert engine_core.step_with_batch_queue()[0] == {}
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 2
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 8
|
||||
# num_computed_tokens should have been updated immediately.
|
||||
@ -327,42 +325,32 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
assert engine_core.scheduler.get_num_unfinished_requests() == 2
|
||||
|
||||
# Batch queue is full. Finish Batch 1.
|
||||
engine_core.step_with_batch_queue()
|
||||
|
||||
# Schedule Batch 3: (4, req1). Note that req0 cannot be scheduled
|
||||
# Finish Batch 1 and schedule Batch 3: (4, req1).
|
||||
# Note that req0 cannot be scheduled
|
||||
# because it is in the decoding stage now.
|
||||
engine_core.step_with_batch_queue()
|
||||
assert engine_core.batch_queue.qsize() == 2
|
||||
scheduler_output = engine_core.batch_queue.queue[-1][1]
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 4
|
||||
|
||||
# Batch queue is full. Finish Batch 2. Get first token of req0.
|
||||
# Finish Batch 2. Get first token of req0.
|
||||
# Schedule Batch 4: (1, req0).
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
|
||||
|
||||
# Schedule Batch 4: (1, req0).
|
||||
engine_core.step_with_batch_queue()
|
||||
assert engine_core.batch_queue.qsize() == 2
|
||||
scheduler_output = engine_core.batch_queue.queue[-1][1]
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 1
|
||||
|
||||
# Batch queue is full. Finish Batch 3. Get first token of req1.
|
||||
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
|
||||
|
||||
# Schedule Batch 5: (1, req1).
|
||||
engine_core.step_with_batch_queue()
|
||||
assert engine_core.batch_queue.qsize() == 2
|
||||
scheduler_output = engine_core.batch_queue.queue[-1][1]
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 1
|
||||
|
||||
# Loop until req0 is finished.
|
||||
step = 0
|
||||
req_id = 0
|
||||
expected_num_tokens = [
|
||||
engine_core.scheduler.requests["0"].num_tokens + 1,
|
||||
@ -370,19 +358,14 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
]
|
||||
while engine_core.scheduler.get_num_unfinished_requests() == 2:
|
||||
output = engine_core.step_with_batch_queue()[0]
|
||||
if step % 2 == 0:
|
||||
# Even steps consumes an output.
|
||||
assert output is not None
|
||||
assert len(output[0].outputs) == 1
|
||||
if req_id in engine_core.scheduler.requests:
|
||||
assert engine_core.scheduler.requests[
|
||||
req_id].num_tokens == expected_num_tokens[req_id]
|
||||
expected_num_tokens[req_id] += 1
|
||||
req_id = (req_id + 1) % 2
|
||||
else:
|
||||
# Odd steps schedules a new batch.
|
||||
assert output is None
|
||||
step += 1
|
||||
# Every step consumes an output.
|
||||
assert output is not None
|
||||
assert len(output[0].outputs) == 1
|
||||
if req_id in engine_core.scheduler.requests:
|
||||
assert engine_core.scheduler.requests[
|
||||
req_id].num_tokens == expected_num_tokens[req_id]
|
||||
expected_num_tokens[req_id] += 1
|
||||
req_id = (req_id + 1) % 2
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
|
||||
@ -52,9 +52,7 @@ def make_request(
|
||||
return EngineCoreRequest(
|
||||
request_id=str(uuid.uuid4()),
|
||||
prompt_token_ids=prompt_tokens_ids,
|
||||
mm_kwargs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
mm_features=None,
|
||||
sampling_params=params,
|
||||
pooling_params=None,
|
||||
eos_token_id=None,
|
||||
|
||||
@ -26,16 +26,14 @@ def test_fast_inc_detok_invalid_utf8_err_case():
|
||||
prompt_token_ids = [107, 4606, 236787, 107]
|
||||
params = SamplingParams(skip_special_tokens=True)
|
||||
request = EngineCoreRequest(
|
||||
"test",
|
||||
prompt_token_ids,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
params,
|
||||
None,
|
||||
None,
|
||||
0.0,
|
||||
None,
|
||||
request_id="test",
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
mm_features=None,
|
||||
sampling_params=params,
|
||||
pooling_params=None,
|
||||
eos_token_id=None,
|
||||
arrival_time=0.0,
|
||||
lora_request=None,
|
||||
cache_salt=None,
|
||||
data_parallel_rank=None,
|
||||
)
|
||||
|
||||
@ -52,11 +52,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
|
||||
requests = [
|
||||
EngineCoreRequest(request_id=f"request-{idx}",
|
||||
prompt_token_ids=prompt_tokens,
|
||||
arrival_time=0,
|
||||
mm_kwargs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
arrival_time=0,
|
||||
lora_request=None,
|
||||
cache_salt=None,
|
||||
data_parallel_rank=None,
|
||||
@ -401,11 +399,9 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
|
||||
requests = [
|
||||
EngineCoreRequest(request_id=request_id_list[idx],
|
||||
prompt_token_ids=prompt_tokens,
|
||||
arrival_time=0,
|
||||
mm_kwargs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
arrival_time=0,
|
||||
lora_request=None,
|
||||
cache_salt=None,
|
||||
data_parallel_rank=None,
|
||||
@ -566,11 +562,9 @@ def test_stop_token(include_stop_str_in_output: bool,
|
||||
request = EngineCoreRequest(
|
||||
request_id=request_id,
|
||||
prompt_token_ids=prompt_tokens,
|
||||
arrival_time=0,
|
||||
mm_kwargs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
mm_features=None,
|
||||
eos_token_id=eos_token_id,
|
||||
arrival_time=0,
|
||||
lora_request=None,
|
||||
cache_salt=None,
|
||||
data_parallel_rank=None,
|
||||
@ -665,11 +659,9 @@ def test_stop_string(include_stop_str_in_output: bool,
|
||||
EngineCoreRequest(
|
||||
request_id=request_id_list[idx],
|
||||
prompt_token_ids=prompt_tokens,
|
||||
arrival_time=0,
|
||||
mm_kwargs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
arrival_time=0,
|
||||
lora_request=None,
|
||||
cache_salt=None,
|
||||
data_parallel_rank=None,
|
||||
@ -781,11 +773,9 @@ def test_iteration_stats(dummy_test_vectors):
|
||||
EngineCoreRequest(
|
||||
request_id=f"request-{idx}",
|
||||
prompt_token_ids=prompt_tokens,
|
||||
arrival_time=0,
|
||||
mm_kwargs=None,
|
||||
mm_hashes=None,
|
||||
mm_placeholders=None,
|
||||
mm_features=None,
|
||||
eos_token_id=None,
|
||||
arrival_time=0,
|
||||
lora_request=None,
|
||||
cache_salt=None,
|
||||
data_parallel_rank=None,
|
||||
|
||||
229
tests/v1/engine/test_processor_multi_modal_uuids.py
Normal file
229
tests/v1/engine/test_processor_multi_modal_uuids.py
Normal file
@ -0,0 +1,229 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
||||
from vllm.platforms.interface import UnspecifiedPlatform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.engine import processor as processor_mod
|
||||
from vllm.v1.engine.processor import Processor
|
||||
|
||||
cherry_pil_image = ImageAsset("cherry_blossom").pil_image
|
||||
stop_pil_image = ImageAsset("stop_sign").pil_image
|
||||
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
|
||||
|
||||
|
||||
# Mock processor for testing
|
||||
def _mk_processor(monkeypatch,
|
||||
*,
|
||||
mm_cache_gb: float = 4.0,
|
||||
enable_prefix_caching: bool = True) -> Processor:
|
||||
"""
|
||||
Create a Processor instance with minimal configuration suitable for unit
|
||||
tests without accessing external resources.
|
||||
"""
|
||||
monkeypatch.setattr(ModelConfig,
|
||||
"try_get_generation_config",
|
||||
lambda self: {},
|
||||
raising=True)
|
||||
monkeypatch.setattr(ModelConfig,
|
||||
"__post_init__",
|
||||
lambda self: None,
|
||||
raising=True)
|
||||
monkeypatch.setattr(UnspecifiedPlatform,
|
||||
"is_async_output_supported",
|
||||
classmethod(lambda cls, enforce_eager: True),
|
||||
raising=True)
|
||||
monkeypatch.setattr(
|
||||
ModelConfig,
|
||||
"verify_async_output_proc",
|
||||
lambda self, parallel_config, speculative_config, device_config: None,
|
||||
raising=True)
|
||||
monkeypatch.setattr(ModelConfig,
|
||||
"verify_with_parallel_config",
|
||||
lambda self, parallel_config: None,
|
||||
raising=True)
|
||||
monkeypatch.setattr(processor_mod,
|
||||
"processor_cache_from_config",
|
||||
lambda vllm_config, mm_registry: None,
|
||||
raising=True)
|
||||
|
||||
monkeypatch.setattr(VllmConfig,
|
||||
"__post_init__",
|
||||
lambda self: None,
|
||||
raising=True)
|
||||
|
||||
model_config = ModelConfig(
|
||||
skip_tokenizer_init=True,
|
||||
max_model_len=128,
|
||||
mm_processor_cache_gb=mm_cache_gb,
|
||||
generation_config="vllm",
|
||||
tokenizer="dummy",
|
||||
)
|
||||
|
||||
# Minimal multimodal_config to satisfy references in
|
||||
# Processor.process_inputs.
|
||||
class _MockMMConfig:
|
||||
|
||||
def __init__(self, gb: float):
|
||||
self.mm_processor_cache_gb = gb
|
||||
|
||||
model_config.multimodal_config = _MockMMConfig(
|
||||
mm_cache_gb) # type: ignore[attr-defined]
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
|
||||
device_config=DeviceConfig(device="cpu"),
|
||||
)
|
||||
|
||||
# Pass tokenizer=None; InputPreprocessor handles None when
|
||||
# skip_tokenizer_init is True.
|
||||
return Processor(vllm_config, tokenizer=None) # type: ignore[arg-type]
|
||||
|
||||
|
||||
def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
|
||||
processor = _mk_processor(monkeypatch)
|
||||
|
||||
prompt = {
|
||||
"prompt": "USER: <image>\nDescribe\nASSISTANT:",
|
||||
"multi_modal_data": {
|
||||
"image": [cherry_pil_image, stop_pil_image]
|
||||
},
|
||||
# Mismatch: 2 items but only 1 uuid provided
|
||||
"multi_modal_uuids": {
|
||||
"image": ["hash_cherry"]
|
||||
},
|
||||
}
|
||||
|
||||
with pytest.raises(ValueError, match="must have same length as data"):
|
||||
processor.process_inputs(
|
||||
request_id="req-1",
|
||||
prompt=prompt, # type: ignore[arg-type]
|
||||
params=SamplingParams(),
|
||||
)
|
||||
|
||||
|
||||
def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
|
||||
processor = _mk_processor(monkeypatch)
|
||||
|
||||
prompt = {
|
||||
"prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
|
||||
# Two modalities provided in data
|
||||
"multi_modal_data": {
|
||||
"image": [cherry_pil_image],
|
||||
"video": [baby_reading_np_ndarrays]
|
||||
},
|
||||
# Only image uuids provided; video missing should raise
|
||||
"multi_modal_uuids": {
|
||||
"image": ["hash_cherry"]
|
||||
},
|
||||
}
|
||||
|
||||
with pytest.raises(ValueError,
|
||||
match="must be provided if multi_modal_data"):
|
||||
processor.process_inputs(
|
||||
request_id="req-2",
|
||||
prompt=prompt, # type: ignore[arg-type]
|
||||
params=SamplingParams(),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"mm_cache_gb, enable_prefix_caching",
|
||||
[
|
||||
(4.0, True), # default behavior
|
||||
(4.0, False), # prefix caching disabled
|
||||
(0.0, True), # processor cache disabled
|
||||
],
|
||||
)
|
||||
def test_multi_modal_uuids_accepts_none_and_passes_through(
|
||||
monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool):
|
||||
processor = _mk_processor(monkeypatch,
|
||||
mm_cache_gb=mm_cache_gb,
|
||||
enable_prefix_caching=enable_prefix_caching)
|
||||
|
||||
# Capture the overrides passed to InputPreprocessor.preprocess
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
def fake_preprocess(prompt,
|
||||
*,
|
||||
tokenization_kwargs=None,
|
||||
lora_request=None,
|
||||
mm_hash_overrides=None):
|
||||
captured["mm_hash_overrides"] = mm_hash_overrides
|
||||
# Minimal processed inputs for decoder-only flow
|
||||
return {"type": "token", "prompt_token_ids": [1]}
|
||||
|
||||
# Monkeypatch only the bound preprocess method on this instance
|
||||
monkeypatch.setattr(processor.input_preprocessor,
|
||||
"preprocess",
|
||||
fake_preprocess,
|
||||
raising=True)
|
||||
|
||||
# Use a consistent two-image scenario across all configurations
|
||||
mm_uuids = {"image": [None, "hash_stop"], "video": None}
|
||||
prompt = {
|
||||
"prompt": "USER: <image><image>\nTwo images\nASSISTANT:",
|
||||
"multi_modal_data": {
|
||||
"image": [cherry_pil_image, stop_pil_image],
|
||||
"video": baby_reading_np_ndarrays,
|
||||
},
|
||||
"multi_modal_uuids": mm_uuids,
|
||||
}
|
||||
|
||||
processor.process_inputs(
|
||||
request_id="req-3",
|
||||
prompt=prompt, # type: ignore[arg-type]
|
||||
params=SamplingParams(),
|
||||
)
|
||||
|
||||
assert captured["mm_hash_overrides"] == mm_uuids
|
||||
|
||||
|
||||
def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
|
||||
# When both processor cache is 0 and prefix caching disabled, the
|
||||
# processor builds overrides from request id instead of using user UUIDs.
|
||||
processor = _mk_processor(monkeypatch,
|
||||
mm_cache_gb=0.0,
|
||||
enable_prefix_caching=False)
|
||||
|
||||
captured: dict[str, object] = {}
|
||||
|
||||
def fake_preprocess(prompt,
|
||||
*,
|
||||
tokenization_kwargs=None,
|
||||
lora_request=None,
|
||||
mm_hash_overrides=None):
|
||||
captured["mm_hash_overrides"] = mm_hash_overrides
|
||||
return {"type": "token", "prompt_token_ids": [1]}
|
||||
|
||||
monkeypatch.setattr(processor.input_preprocessor,
|
||||
"preprocess",
|
||||
fake_preprocess,
|
||||
raising=True)
|
||||
|
||||
request_id = "req-42"
|
||||
mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
|
||||
prompt = {
|
||||
"prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
|
||||
"multi_modal_data": {
|
||||
"image": [cherry_pil_image, stop_pil_image],
|
||||
"video": baby_reading_np_ndarrays,
|
||||
},
|
||||
"multi_modal_uuids": mm_uuids,
|
||||
}
|
||||
|
||||
processor.process_inputs(
|
||||
request_id=request_id,
|
||||
prompt=prompt, # type: ignore[arg-type]
|
||||
params=SamplingParams(),
|
||||
)
|
||||
|
||||
# Expect request-id-based overrides are passed through
|
||||
assert captured["mm_hash_overrides"] == {
|
||||
"image": [f"{request_id}-image-0", f"{request_id}-image-1"],
|
||||
"video": [f"{request_id}-video-0"],
|
||||
}
|
||||
@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor):
|
||||
kwargs: Optional[dict] = None,
|
||||
non_block: bool = False,
|
||||
unique_reply_rank: Optional[int] = None) -> list[Any]:
|
||||
# Drop marker to show that this was ran
|
||||
# Drop marker to show that this was run
|
||||
with open(".marker", "w"):
|
||||
...
|
||||
return super().collective_rpc(method, timeout, args, kwargs)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user