Compare commits
98 Commits
split_kv_c
...
woosuk/fix
| Author | SHA1 | Date | |
|---|---|---|---|
| 936da0f740 | |||
| 20098c10d9 | |||
| ee7a66dd9a | |||
| 431535b522 | |||
| 711e912946 | |||
| e69e0b8b5f | |||
| ddc9048394 | |||
| b1a63d1b3b | |||
| 48ecb4438b | |||
| e57fc15971 | |||
| 4bdf400218 | |||
| 7852b82b93 | |||
| a2a5f79e09 | |||
| c59a0eca42 | |||
| b716ab93a7 | |||
| 138f0d1e75 | |||
| 2506ce5189 | |||
| 47fd08aaf9 | |||
| 12aed7e453 | |||
| d90e212a3a | |||
| 2821986450 | |||
| 6c117cff7d | |||
| 7ac67ea525 | |||
| ce75e15373 | |||
| aed16879a9 | |||
| cf278ff3b2 | |||
| 838d7116ba | |||
| 5089fd749c | |||
| a3d087adec | |||
| 058525b997 | |||
| 1dfea5f4a9 | |||
| cea91a32f2 | |||
| a684c0124c | |||
| f2718d2948 | |||
| 825fdb11ad | |||
| 8c1d4acbfe | |||
| 486c5599e3 | |||
| a6149aa587 | |||
| 6c8a3c099b | |||
| 31a8a2a7bc | |||
| 1a0a04dae9 | |||
| 6d8246aaff | |||
| 9d1c50a5ac | |||
| 9a4600e4dc | |||
| 9fac6aa30b | |||
| a53ad626d6 | |||
| 1c3dad22ff | |||
| d2a30a2d93 | |||
| 75fb112d80 | |||
| 38db529f66 | |||
| 064cac7bb7 | |||
| e19bce40a1 | |||
| 505805b645 | |||
| bbdc0f2366 | |||
| dc34059360 | |||
| c4cb0af98a | |||
| 1c3b1634aa | |||
| 2ea50e977a | |||
| b419937c78 | |||
| 5f696c33b1 | |||
| 67244c86f0 | |||
| 072d7e53e5 | |||
| 01a583fea4 | |||
| bc19d75985 | |||
| fbd6523ac0 | |||
| 470484a4f5 | |||
| 21da73343a | |||
| 66072b36db | |||
| 3ed1ec4af2 | |||
| 5a33ae9a3f | |||
| c9ff9e6f0c | |||
| eaffe4486c | |||
| 8ed039d527 | |||
| 37970105fe | |||
| cc935fdd7e | |||
| abdfcd4f3d | |||
| 4f02b77de4 | |||
| 29283e8976 | |||
| 05b044e698 | |||
| aa3f105c59 | |||
| ef7eefe17a | |||
| 350c94deb3 | |||
| f4cd80f944 | |||
| 349e0e3462 | |||
| 81b16a2bc9 | |||
| e111d5b0ae | |||
| a904ea78ea | |||
| b7433ca1a4 | |||
| 5c65a72bb1 | |||
| 9d8a2d86d2 | |||
| 3bc18127ff | |||
| bec060fd99 | |||
| 52bc9d5b3e | |||
| dc2979c585 | |||
| 027d37df38 | |||
| b98219670f | |||
| 32baf1d036 | |||
| 3127274d02 |
@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#Obsolete currently
|
||||
##ignore certain Entrypoints/llm tests
|
||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
#fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
|
||||
@ -46,22 +46,18 @@ steps:
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/async_engine
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/utils_
|
||||
- tests/worker
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
- tests/transformers_utils
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s utils_ # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
- pytest -v -s transformers_utils # transformers_utils
|
||||
|
||||
- label: Python-only Installation Test # 10min
|
||||
@ -82,14 +78,12 @@ steps:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_basic_correctness
|
||||
- tests/basic_correctness/test_cpu_offload
|
||||
- tests/basic_correctness/test_preemption
|
||||
- tests/basic_correctness/test_cumem.py
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s basic_correctness/test_cumem.py
|
||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Entrypoints Unit Tests # 5min
|
||||
timeout_in_minutes: 10
|
||||
@ -114,8 +108,7 @@ steps:
|
||||
- tests/entrypoints/offline_mode
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
@ -287,6 +280,7 @@ steps:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/kv_offload
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
|
||||
19
.github/CODEOWNERS
vendored
19
.github/CODEOWNERS
vendored
@ -41,7 +41,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
@ -50,7 +49,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
@ -63,19 +61,30 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
|
||||
# Transformers backend
|
||||
/vllm/model_executor/models/transformers.py @hmellor
|
||||
/tests/models/test_transformers.py @hmellor
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
/docs/mkdocs @hmellor
|
||||
/docs/**/*.yml @hmellor
|
||||
/requirements/docs.txt @hmellor
|
||||
.readthedocs.yaml @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
|
||||
# Linting
|
||||
.markdownlint.yaml @hmellor
|
||||
.pre-commit-config.yaml @hmellor
|
||||
|
||||
# CPU
|
||||
/vllm/v1/worker/^cpu @bigPYJ1151
|
||||
/vllm/v1/worker/cpu* @bigPYJ1151
|
||||
/csrc/cpu @bigPYJ1151
|
||||
/vllm/platforms/cpu.py @bigPYJ1151
|
||||
/cmake/cpu_extension.cmake @bigPYJ1151
|
||||
/docker/Dockerfile.cpu @bigPYJ1151
|
||||
|
||||
# Intel GPU
|
||||
/vllm/v1/worker/^xpu @jikunshang
|
||||
/vllm/v1/worker/xpu* @jikunshang
|
||||
/vllm/platforms/xpu.py @jikunshang
|
||||
/docker/Dockerfile.xpu @jikunshang
|
||||
|
||||
|
||||
19
.github/mergify.yml
vendored
19
.github/mergify.yml
vendored
@ -171,7 +171,7 @@ pull_request_rules:
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||
- files~=^tests/v1/structured_output/
|
||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
||||
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||
- files~=^vllm/v1/structured_output/
|
||||
actions:
|
||||
label:
|
||||
@ -302,3 +302,20 @@ pull_request_rules:
|
||||
label:
|
||||
remove:
|
||||
- needs-rebase
|
||||
|
||||
- name: label-kv-connector
|
||||
description: Automatically apply kv-connector label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^examples/online_serving/disaggregated[^/]*/.*
|
||||
- files~=^examples/offline_inference/disaggregated[^/]*/.*
|
||||
- files~=^examples/others/lmcache/
|
||||
- files~=^tests/v1/kv_connector/
|
||||
- files~=^vllm/distributed/kv_transfer/
|
||||
- title~=(?i)\bP/?D\b
|
||||
- title~=(?i)NIXL
|
||||
- title~=(?i)LMCache
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- kv-connector
|
||||
@ -49,7 +49,7 @@ repos:
|
||||
rev: 0.6.17
|
||||
hooks:
|
||||
- id: pip-compile
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- repo: local
|
||||
hooks:
|
||||
@ -164,9 +164,7 @@ repos:
|
||||
name: Validate configuration has default values and that each field has a docstring
|
||||
entry: python tools/validate_config.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: true
|
||||
files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
|
||||
additional_dependencies: [regex]
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@ -696,11 +696,11 @@ def evaluate(ret, args):
|
||||
return re.match(args.regex, actual) is not None
|
||||
|
||||
def _eval_correctness(expected, actual):
|
||||
if args.structure_type == "guided_json":
|
||||
if args.structure_type == "json":
|
||||
return _eval_correctness_json(expected, actual)
|
||||
elif args.structure_type == "guided_regex":
|
||||
elif args.structure_type == "regex":
|
||||
return _eval_correctness_regex(expected, actual)
|
||||
elif args.structure_type == "guided_choice":
|
||||
elif args.structure_type == "choice":
|
||||
return _eval_correctness_choice(expected, actual)
|
||||
else:
|
||||
return None
|
||||
@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
|
||||
)
|
||||
|
||||
if args.dataset == "grammar":
|
||||
args.structure_type = "guided_grammar"
|
||||
args.structure_type = "grammar"
|
||||
elif args.dataset == "regex":
|
||||
args.structure_type = "guided_regex"
|
||||
args.structure_type = "regex"
|
||||
elif args.dataset == "choice":
|
||||
args.structure_type = "guided_choice"
|
||||
args.structure_type = "choice"
|
||||
else:
|
||||
args.structure_type = "guided_json"
|
||||
args.structure_type = "json"
|
||||
|
||||
if args.no_structured_output:
|
||||
args.structured_output_ratio = 0
|
||||
if args.save_results:
|
||||
result_file_name = f"{args.structured_output_ratio}guided"
|
||||
result_file_name = f"{args.structured_output_ratio}so"
|
||||
result_file_name += f"_{backend}"
|
||||
result_file_name += f"_{args.request_rate}qps"
|
||||
result_file_name += f"_{args.model.split('/')[-1]}"
|
||||
|
||||
@ -11,13 +11,13 @@ from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import triton
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
_w8a8_block_fp8_matmul,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import triton
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
@ -17,4 +17,8 @@
|
||||
#warning "unsupported vLLM cpu implementation"
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
38
csrc/launch_bounds_utils.h
Normal file
38
csrc/launch_bounds_utils.h
Normal file
@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <algorithm>
|
||||
|
||||
// maximum blocks per SM cap
|
||||
#ifndef VLLM_LAUNCH_BLOCKS_CAP
|
||||
#define VLLM_LAUNCH_BLOCKS_CAP 4
|
||||
#endif
|
||||
|
||||
// compile-time estimate of max threads per SM for launch bounds.
|
||||
#ifndef VLLM_MAX_THREADS_PER_SM
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
|
||||
#define VLLM_MAX_THREADS_PER_SM 1536
|
||||
#else
|
||||
#define VLLM_MAX_THREADS_PER_SM 2048
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// compute the number of blocks per SM to request in __launch_bounds__
|
||||
#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL))
|
||||
#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \
|
||||
(((VAL) <= 0) \
|
||||
? 1 \
|
||||
: (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP))
|
||||
#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \
|
||||
VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS))
|
||||
|
||||
// runtime-time helper to compute blocks/SM
|
||||
static inline int vllm_runtime_blocks_per_sm(int block_threads) {
|
||||
int device = -1;
|
||||
cudaGetDevice(&device);
|
||||
int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM;
|
||||
cudaDeviceGetAttribute(&max_threads_per_sm,
|
||||
cudaDevAttrMaxThreadsPerMultiProcessor, device);
|
||||
int blocks = (block_threads > 0) ? (max_threads_per_sm / block_threads) : 1;
|
||||
return VLLM_CLAMP_BLOCKS_PER_SM(blocks);
|
||||
}
|
||||
@ -21,6 +21,7 @@
|
||||
#include <torch/all.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda/std/limits>
|
||||
#include <cooperative_groups.h>
|
||||
#include <cooperative_groups/reduce.h>
|
||||
namespace cg = cooperative_groups;
|
||||
@ -28,7 +29,6 @@ namespace cg = cooperative_groups;
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
constexpr float kNegInfinity = INFINITY * -1;
|
||||
constexpr unsigned FULL_WARP_MASK = 0xffffffff;
|
||||
constexpr int32_t WARP_SIZE = 32;
|
||||
constexpr int32_t BLOCK_SIZE = 512;
|
||||
@ -411,14 +411,21 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
|
||||
return __bfloat162float(val);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T neg_inf() {
|
||||
// cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
|
||||
// so we need to cast from fp32
|
||||
return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ void topk_with_k2(T* output, T const* input,
|
||||
cg::thread_block_tile<32> const& tile,
|
||||
int32_t const lane_id,
|
||||
int const num_experts_per_group) {
|
||||
// Get the top2 per thread
|
||||
T largest = -INFINITY;
|
||||
T second_largest = -INFINITY;
|
||||
T largest = neg_inf<T>();
|
||||
T second_largest = neg_inf<T>();
|
||||
|
||||
if (num_experts_per_group > WARP_SIZE) {
|
||||
for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
|
||||
@ -513,8 +520,8 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
warp_id * topk;
|
||||
s_topk_idx += warp_id * topk;
|
||||
|
||||
T value = kNegInfinity;
|
||||
T topk_group_value = kNegInfinity;
|
||||
T value = neg_inf<T>();
|
||||
T topk_group_value = neg_inf<T>();
|
||||
int32_t num_equalto_topkth_group;
|
||||
|
||||
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
|
||||
@ -525,11 +532,8 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (case_id < num_tokens) {
|
||||
// calculate group_idx
|
||||
int32_t target_num_min = WARP_SIZE - n_group + topk_group;
|
||||
if (lane_id < n_group &&
|
||||
(isfinite(cuda_cast<float, T>(
|
||||
group_scores[lane_id])))) // The check is necessary to avoid
|
||||
// abnormal input
|
||||
{
|
||||
// The check is necessary to avoid abnormal input
|
||||
if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) {
|
||||
value = group_scores[lane_id];
|
||||
}
|
||||
|
||||
@ -540,11 +544,11 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
__syncwarp(); // Ensure all threads have valid data before reduction
|
||||
topk_group_value = cg::reduce(tile, value, cg::greater<T>());
|
||||
if (value == topk_group_value) {
|
||||
value = kNegInfinity;
|
||||
value = neg_inf<T>();
|
||||
}
|
||||
pre_count_equal_to_top_value = count_equal_to_top_value;
|
||||
count_equal_to_top_value = __popc(__ballot_sync(
|
||||
FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
|
||||
count_equal_to_top_value =
|
||||
__popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
|
||||
}
|
||||
num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
|
||||
}
|
||||
@ -552,11 +556,10 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
|
||||
warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
|
||||
/* is_stable */ true>
|
||||
queue((int32_t)topk, -INFINITY);
|
||||
queue((int32_t)topk, neg_inf<T>());
|
||||
|
||||
int count_equalto_topkth_group = 0;
|
||||
bool if_proceed_next_topk =
|
||||
(topk_group_value != cuda_cast<T, float>(kNegInfinity));
|
||||
bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
|
||||
if (case_id < num_tokens && if_proceed_next_topk) {
|
||||
for (int i_group = 0; i_group < n_group; i_group++) {
|
||||
if ((group_scores[i_group] > topk_group_value) ||
|
||||
@ -566,10 +569,10 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
for (int32_t i = lane_id; i < align_num_experts_per_group;
|
||||
i += WARP_SIZE) {
|
||||
T candidates =
|
||||
(i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
|
||||
scores_with_bias[offset + i]))
|
||||
(i < num_experts_per_group) &&
|
||||
cuda::std::isfinite(scores_with_bias[offset + i])
|
||||
? scores_with_bias[offset + i]
|
||||
: cuda_cast<T, float>(kNegInfinity);
|
||||
: neg_inf<T>();
|
||||
queue.add(candidates, offset + i);
|
||||
}
|
||||
if (group_scores[i_group] == topk_group_value) {
|
||||
@ -598,7 +601,8 @@ __global__ void group_idx_and_topk_idx_kernel(
|
||||
if (i < topk) {
|
||||
s_topk_value[i] = value;
|
||||
}
|
||||
topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
topk_sum +=
|
||||
cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
int32_t compute_pipeline_offset_64 = 0;
|
||||
|
||||
for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
|
||||
__nv_bfloat16 y_max_bf16 = EPS;
|
||||
__nv_bfloat162 results_bf162[2];
|
||||
|
||||
cp_async_wait<NUM_STAGES - 2>();
|
||||
@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
|
||||
auto _y_max2 =
|
||||
__hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
|
||||
|
||||
y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
|
||||
__nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
|
||||
|
||||
// An entire group is assigned to a single warp, so a simple warp reduce
|
||||
// is used.
|
||||
|
||||
@ -26,6 +26,7 @@
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "launch_bounds_utils.h"
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
@ -63,7 +64,7 @@ __inline__ __device__ PackedVec<Type> compute_silu_mul(PackedVec<Type>& vec,
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(1024, 4)
|
||||
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out,
|
||||
uint32_t* SFout) {
|
||||
@ -131,7 +132,8 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
||||
int const numBlocksPerSM = 2048 / block.x;
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(
|
||||
|
||||
@ -26,12 +26,13 @@
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "nvfp4_utils.cuh"
|
||||
#include "launch_bounds_utils.h"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||
__global__ void __launch_bounds__(512, 4)
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||
uint32_t* input_offset_by_experts,
|
||||
@ -129,7 +130,7 @@ __global__ void __launch_bounds__(512, 4)
|
||||
|
||||
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
|
||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||
__global__ void __launch_bounds__(1024, 4)
|
||||
__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024))
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||
uint32_t* input_offset_by_experts,
|
||||
@ -233,8 +234,9 @@ void quant_impl(void* output, void* output_scale, void* input,
|
||||
int const workSizePerRow = k / ELTS_PER_THREAD;
|
||||
int const totalWorkSize = m_topk * workSizePerRow;
|
||||
dim3 block(std::min(workSizePerRow, 512));
|
||||
// Get number of blocks per SM (assume we can fully utilize the SM).
|
||||
int const numBlocksPerSM = 2048 / block.x;
|
||||
// Get number of blocks per SM
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x),
|
||||
multiProcessorCount * numBlocksPerSM));
|
||||
while (grid.x <= multiProcessorCount && block.x > 64) {
|
||||
|
||||
@ -26,13 +26,14 @@
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "launch_bounds_utils.h"
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void __launch_bounds__(512, 4)
|
||||
__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
@ -75,8 +76,9 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
|
||||
// Grid, Block size.
|
||||
// Each thread converts 8 values.
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
|
||||
// Get number of blocks per SM (assume we can fully utilize the SM).
|
||||
int const numBlocksPerSM = 2048 / block.x;
|
||||
// Get number of blocks per SM
|
||||
int const numBlocksPerSM =
|
||||
vllm_runtime_blocks_per_sm(static_cast<int>(block.x));
|
||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
// Launch the cvt kernel.
|
||||
|
||||
@ -29,7 +29,10 @@ ARG VLLM_BRANCH="main"
|
||||
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||
&& cd vllm \
|
||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||
&& git checkout FETCH_HEAD
|
||||
&& git checkout FETCH_HEAD \
|
||||
&& if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
|
||||
git remote add upstream "https://github.com/vllm-project/vllm.git" \
|
||||
&& git fetch upstream ; fi
|
||||
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||
|
||||
# -----------------------
|
||||
|
||||
@ -1,25 +1,23 @@
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
|
||||
ARG HIPBLASLT_BRANCH="aa0bda7b"
|
||||
ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
|
||||
ARG LEGACY_HIPBLASLT_OPTION=
|
||||
ARG TRITON_BRANCH="e5be006"
|
||||
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
||||
ARG PYTORCH_BRANCH="f717b2af"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.21.0"
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
|
||||
ARG TRITON_BRANCH="f9e5bf54"
|
||||
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
|
||||
ARG PYTORCH_BRANCH="b2fb6885"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.23.0"
|
||||
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="1a7f4dfa"
|
||||
ARG FA_BRANCH="0e60e394"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="4822e675"
|
||||
ARG AITER_BRANCH="2ab9f4cd"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
||||
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
|
||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||
ENV AITER_ROCM_ARCH=gfx942;gfx950
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
@ -45,29 +43,6 @@ RUN apt-get update -y \
|
||||
|
||||
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
|
||||
|
||||
FROM base AS build_hipblaslt
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
RUN git clone https://github.com/ROCm/hipBLAS-common.git
|
||||
RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
|
||||
RUN cd hipBLAS-common \
|
||||
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
|
||||
&& mkdir build \
|
||||
&& cd build \
|
||||
&& cmake .. \
|
||||
&& make package \
|
||||
&& dpkg -i ./*.deb
|
||||
RUN git clone https://github.com/ROCm/hipBLASLt
|
||||
RUN cd hipBLASLt \
|
||||
&& git checkout ${HIPBLASLT_BRANCH} \
|
||||
&& apt-get install -y llvm-dev \
|
||||
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||
&& cd build/release \
|
||||
&& make package
|
||||
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
||||
|
||||
FROM base AS build_triton
|
||||
ARG TRITON_BRANCH
|
||||
ARG TRITON_REPO
|
||||
@ -121,13 +96,11 @@ RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
||||
|
||||
FROM base AS debs
|
||||
RUN mkdir /app/debs
|
||||
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||
cp /install/*.deb /app/debs
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
@ -138,11 +111,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
cp /install/*.whl /app/debs
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||
dpkg -i /install/*deb \
|
||||
&& perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
|
||||
&& perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
|
||||
&& perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
|
||||
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
@ -153,9 +121,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG BASE_IMAGE
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
ARG TRITON_BRANCH
|
||||
ARG TRITON_REPO
|
||||
ARG PYTORCH_BRANCH
|
||||
@ -167,9 +132,6 @@ ARG FA_REPO
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
|
||||
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
|
||||
@ -177,5 +139,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||
@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
|
||||
- [vllm.config.LoRAConfig][]
|
||||
- [vllm.config.MultiModalConfig][]
|
||||
- [vllm.config.PoolerConfig][]
|
||||
- [vllm.config.DecodingConfig][]
|
||||
- [vllm.config.StructuredOutputsConfig][]
|
||||
- [vllm.config.ObservabilityConfig][]
|
||||
- [vllm.config.KVTransferConfig][]
|
||||
- [vllm.config.CompilationConfig][]
|
||||
@ -46,7 +46,6 @@ Engine classes for offline and online inference.
|
||||
Inference parameters for vLLM APIs.
|
||||
|
||||
[](){ #sampling-params }
|
||||
[](){ #pooling-params }
|
||||
|
||||
- [vllm.SamplingParams][]
|
||||
- [vllm.PoolingParams][]
|
||||
|
||||
@ -175,6 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
|
||||
Known supported models:
|
||||
|
||||
- GLM-4.5V GLM-4.1V (<gh-pr:23168>)
|
||||
- InternVL (<gh-pr:23909>)
|
||||
- Kimi-VL (<gh-pr:23817>)
|
||||
- Llama4 (<gh-pr:18368>)
|
||||
- MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
|
||||
|
||||
@ -26,113 +26,123 @@ See <gh-file:LICENSE>.
|
||||
|
||||
## Developing
|
||||
|
||||
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
|
||||
|
||||
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
|
||||
Check out the [building from source][build-from-source] documentation for details.
|
||||
|
||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||
|
||||
### Building the docs with MkDocs
|
||||
|
||||
#### Introduction to MkDocs
|
||||
|
||||
[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
|
||||
|
||||
#### Install MkDocs and Plugins
|
||||
|
||||
Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/docs.txt
|
||||
```
|
||||
|
||||
!!! note
|
||||
Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
|
||||
|
||||
#### Verify Installation
|
||||
|
||||
Confirm that MkDocs is correctly installed:
|
||||
|
||||
```bash
|
||||
mkdocs --version
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```console
|
||||
mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
|
||||
```
|
||||
|
||||
#### Clone the `vLLM` repository
|
||||
The first step of contributing to vLLM is to clone the GitHub repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
```
|
||||
|
||||
#### Start the Development Server
|
||||
Then, configure your Python virtual environment.
|
||||
|
||||
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
|
||||
--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
|
||||
|
||||
If you are only developing vLLM's Python code, install vLLM using:
|
||||
|
||||
```bash
|
||||
mkdocs serve
|
||||
VLLM_USE_PRECOMPILED=1 uv pip install -e .
|
||||
```
|
||||
|
||||
Example output:
|
||||
If you are developing vLLM's Python and CUDA/C++ code, install vLLM using:
|
||||
|
||||
```console
|
||||
INFO - Documentation built in 106.83 seconds
|
||||
INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
|
||||
INFO - [22:02:02] Serving on http://127.0.0.1:8000/
|
||||
```bash
|
||||
uv pip install -e .
|
||||
```
|
||||
|
||||
#### View in Your Browser
|
||||
For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section.
|
||||
|
||||
Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
|
||||
|
||||
#### Learn More
|
||||
|
||||
For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
|
||||
|
||||
## Testing
|
||||
|
||||
??? console "Commands"
|
||||
|
||||
```bash
|
||||
# These commands are only for Nvidia CUDA platforms.
|
||||
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
|
||||
|
||||
# Linting, formatting and static type checking
|
||||
pre-commit install
|
||||
|
||||
# You can manually run pre-commit with
|
||||
pre-commit run --all-files --show-diff-on-failure
|
||||
|
||||
# To manually run something from CI that does not run
|
||||
# locally by default, you can run:
|
||||
pre-commit run mypy-3.9 --hook-stage manual --all-files
|
||||
|
||||
# Unit tests
|
||||
pytest tests/
|
||||
|
||||
# Run tests for a single test file with detailed output
|
||||
pytest -s -v tests/test_logger.py
|
||||
```
|
||||
For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
|
||||
|
||||
!!! tip
|
||||
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
|
||||
vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
|
||||
|
||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||
|
||||
!!! note "Install python3-dev if Python.h is missing"
|
||||
### Linting
|
||||
|
||||
vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as:
|
||||
|
||||
```bash
|
||||
uv pip install pre-commit
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
vLLM's `pre-commit` hooks will now run automatically every time you commit.
|
||||
|
||||
!!! tip "Tips"
|
||||
You can manually run the `pre-commit` hooks using:
|
||||
|
||||
```bash
|
||||
pre-commit run # runs on staged files
|
||||
pre-commit run -a # runs on all files (short for --all-files)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
|
||||
|
||||
```bash
|
||||
pre-commit run --hook-stage manual markdownlint
|
||||
pre-commit run --hook-stage manual mypy-3.9
|
||||
```
|
||||
|
||||
### Documentation
|
||||
|
||||
MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
|
||||
|
||||
Get started with:
|
||||
|
||||
```bash
|
||||
uv pip install -r requirements/docs.txt
|
||||
```
|
||||
|
||||
!!! tip
|
||||
Ensure that your Python version is compatible with the plugins
|
||||
(e.g., `mkdocs-awesome-nav` requires Python 3.10+)
|
||||
|
||||
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it.
|
||||
From the root of the repository, run:
|
||||
|
||||
```bash
|
||||
mkdocs serve # with API ref (~10 minutes)
|
||||
API_AUTONAV_EXCLUDE=vllm mkdocs serve # API ref off (~15 seconds)
|
||||
```
|
||||
|
||||
Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready!
|
||||
Open <http://127.0.0.1:8000/> in your browser to see it.
|
||||
|
||||
For additional features and advanced configurations, refer to the:
|
||||
|
||||
- [MkDocs documentation](https://www.mkdocs.org/)
|
||||
- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use)
|
||||
|
||||
### Testing
|
||||
|
||||
vLLM uses `pytest` to test the codebase.
|
||||
|
||||
```bash
|
||||
# Install the test dependencies used in CI (CUDA only)
|
||||
uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
|
||||
|
||||
# Install some common test dependencies (hardware agnostic)
|
||||
uv pip install pytest pytest-asyncio
|
||||
|
||||
# Run all tests
|
||||
pytest tests/
|
||||
|
||||
# Run tests for a single test file with detailed output
|
||||
pytest -s -v tests/test_logger.py
|
||||
```
|
||||
|
||||
!!! tip "Install python3-dev if Python.h is missing"
|
||||
If any of the above commands fails with `Python.h: No such file or directory`, install
|
||||
`python3-dev` with `sudo apt install python3-dev`.
|
||||
|
||||
!!! note
|
||||
!!! warning "Warnings"
|
||||
Currently, the repository is not fully checked by `mypy`.
|
||||
|
||||
!!! note
|
||||
---
|
||||
|
||||
Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
|
||||
platform to run unit tests locally, rely on the continuous integration system to run the tests for
|
||||
now.
|
||||
@ -194,8 +204,7 @@ appropriately to indicate the type of change. Please use one of the following:
|
||||
The PR needs to meet the following code quality standards:
|
||||
|
||||
- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
|
||||
- Pass all linter checks. Please use `pre-commit` to format your code. See
|
||||
<https://pre-commit.com/#usage> if `pre-commit` is new to you.
|
||||
- Pass all linter checks.
|
||||
- The code needs to be well-documented to ensure future contributors can easily
|
||||
understand the code.
|
||||
- Include sufficient tests to ensure the project stays correct and robust. This
|
||||
|
||||
@ -156,7 +156,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--endpoint-type openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--endpoint-type openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
@ -245,7 +243,6 @@ vllm bench serve \
|
||||
```bash
|
||||
vllm bench serve \
|
||||
--backend openai-chat \
|
||||
--endpoint-type openai-chat \
|
||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||
--endpoint /v1/chat/completions \
|
||||
--dataset-name hf \
|
||||
|
||||
@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:
|
||||
|
||||
| Model Series | Parser Name | Structured Output Support | Tool Calling |
|
||||
|--------------|-------------|------------------|-------------|
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
|
||||
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
|
||||
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |
|
||||
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
|
||||
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
|
||||
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
|
||||
|
||||
!!! note
|
||||
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
|
||||
|
||||
@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
|
||||
|
||||
The following parameters are supported, which must be added as extra parameters:
|
||||
|
||||
- `guided_choice`: the output will be exactly one of the choices.
|
||||
- `guided_regex`: the output will follow the regex pattern.
|
||||
- `guided_json`: the output will follow the JSON schema.
|
||||
- `guided_grammar`: the output will follow the context free grammar.
|
||||
- `choice`: the output will be exactly one of the choices.
|
||||
- `regex`: the output will follow the regex pattern.
|
||||
- `json`: the output will follow the JSON schema.
|
||||
- `grammar`: the output will follow the context free grammar.
|
||||
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
|
||||
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
|
||||
|
||||
Structured outputs are supported by default in the OpenAI-Compatible Server. You
|
||||
may choose to specify the backend to use by setting the
|
||||
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
which will try to choose an appropriate backend based on the details of the
|
||||
request. You may also choose a specific backend, along with
|
||||
some options. A full set of options is available in the `vllm serve --help`
|
||||
text.
|
||||
|
||||
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
|
||||
Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one:
|
||||
|
||||
??? code
|
||||
|
||||
@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
|
||||
messages=[
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
|
||||
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
|
||||
|
||||
??? code
|
||||
|
||||
@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
|
||||
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
|
||||
For this we can use the `guided_json` parameter in two different ways:
|
||||
For this we can use the `json` parameter in two different ways:
|
||||
|
||||
- Using directly a [JSON Schema](https://json-schema.org/)
|
||||
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
|
||||
|
||||
The next example shows how to use the `guided_json` parameter with a Pydantic model:
|
||||
The next example shows how to use the `response_format` parameter with a Pydantic model:
|
||||
|
||||
??? code
|
||||
|
||||
@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
|
||||
JSON schema and how the fields should be populated. This can improve the
|
||||
results notably in most cases.
|
||||
|
||||
Finally we have the `guided_grammar` option, which is probably the most
|
||||
Finally we have the `grammar` option, which is probably the most
|
||||
difficult to use, but it´s really powerful. It allows us to define complete
|
||||
languages like SQL queries. It works by using a context free EBNF grammar.
|
||||
As an example, we can use to define a specific format of simplified SQL queries:
|
||||
@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
|
||||
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
|
||||
## Offline Inference
|
||||
|
||||
Offline inference allows for the same types of structured outputs.
|
||||
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
|
||||
The main available options inside `GuidedDecodingParams` are:
|
||||
To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
|
||||
The main available options inside `StructuredOutputsParams` are:
|
||||
|
||||
- `json`
|
||||
- `regex`
|
||||
@ -309,12 +309,12 @@ shown below:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
|
||||
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
|
||||
|
||||
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
|
||||
outputs = llm.generate(
|
||||
prompts="Classify this sentiment: vLLM is wonderful!",
|
||||
sampling_params=sampling_params,
|
||||
|
||||
@ -71,7 +71,7 @@ This example demonstrates:
|
||||
* Making a request with `tool_choice="auto"`
|
||||
* Handling the structured response and executing the corresponding function
|
||||
|
||||
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
|
||||
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
|
||||
|
||||
Remember that it's the caller's responsibility to:
|
||||
|
||||
@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
|
||||
|
||||
## Named Function Calling
|
||||
|
||||
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
|
||||
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
|
||||
vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
|
||||
high-quality one.
|
||||
|
||||
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
|
||||
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
|
||||
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
|
||||
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
|
||||
|
||||
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
|
||||
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
|
||||
|
||||
## Required Function Calling
|
||||
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
|
||||
|
||||
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
|
||||
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
|
||||
|
||||
```bash
|
||||
uv venv --python 3.12 --seed
|
||||
|
||||
@ -59,7 +59,7 @@ enabling the corresponding APIs:
|
||||
#### Predefined models
|
||||
|
||||
If the [Pooler][vllm.model_executor.layers.pooler.Pooler] defined by the model accepts `pooler_config`,
|
||||
you can override some of its attributes via the `--override-pooler-config` option.
|
||||
you can override some of its attributes via the `--pooler-config` option.
|
||||
|
||||
#### Converted models
|
||||
|
||||
@ -75,7 +75,7 @@ the pooler assigned to each task has the following attributes by default:
|
||||
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
|
||||
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
|
||||
|
||||
You can further customize this via the `--override-pooler-config` option,
|
||||
You can further customize this via the `--pooler-config` option,
|
||||
which takes priority over both the model's and Sentence Transformers's defaults.
|
||||
|
||||
## Offline Inference
|
||||
|
||||
@ -17,9 +17,24 @@ These models are what we list in [supported-text-models][supported-text-models]
|
||||
|
||||
### Transformers
|
||||
|
||||
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases.
|
||||
vLLM also supports model implementations that are available in Transformers. You should expect the performance of a Transformers model implementation used in vLLM to be within <1% of the performance of a dedicated vLLM model implementation. We call this feature the "Transformers backend".
|
||||
|
||||
To check if the modeling backend is Transformers, you can simply do this:
|
||||
Currently, the Transformers backend works for the following:
|
||||
|
||||
- Modalities: embedding models, language models and vision-language models*
|
||||
- Architectures: encoder-only, decoder-only
|
||||
- Attention types: full attention and/or sliding attention
|
||||
|
||||
_*Vision-language models currently accept only image inputs. Support for video inputs will be added in a future release._
|
||||
|
||||
If the Transformers model implementation follows all the steps in [writing a custom model](#writing-custom-models) then, when used with the Transformers backend, it will be compatible with the following features of vLLM:
|
||||
|
||||
- All the features listed in the [compatibility matrix](../features/compatibility_matrix.md#feature-x-feature)
|
||||
- Any combination of the following vLLM parallelisation schemes:
|
||||
- Pipeline parallel
|
||||
- Tensor parallel
|
||||
|
||||
Checking if the modeling backend is Transformers is as simple as:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
@ -27,16 +42,12 @@ llm = LLM(model=...) # Name or path of your model
|
||||
llm.apply_model(lambda model: print(type(model)))
|
||||
```
|
||||
|
||||
If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!
|
||||
If the printed type starts with `Transformers...` then it's using the Transformers model implementation!
|
||||
|
||||
!!! tip
|
||||
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
|
||||
If a model has a vLLM implementation but you would prefer to use the Transformers implementation via the Transformers backend, set `model_impl="transformers"` for [offline inference](../serving/offline_inference.md) or `--model-impl transformers` for the [online serving](../serving/openai_compatible_server.md).
|
||||
|
||||
!!! note
|
||||
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
|
||||
|
||||
!!! note
|
||||
In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
|
||||
For vision-language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
|
||||
|
||||
#### Custom models
|
||||
|
||||
@ -66,10 +77,11 @@ This section details the necessary modifications to make to a Transformers compa
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
1. If your model is encoder-only, you must also add `is_causal = False` to `MyAttention`.
|
||||
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
||||
3. `MyModel` must contain `_supports_attention_backend = True`.
|
||||
|
||||
<details>
|
||||
<details class="code">
|
||||
<summary>modeling_my_model.py</summary>
|
||||
|
||||
```python
|
||||
@ -78,6 +90,7 @@ from transformers import PreTrainedModel
|
||||
from torch import nn
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
is_causal = False # Only do this for encoder-only models
|
||||
|
||||
def forward(self, hidden_states, **kwargs):
|
||||
...
|
||||
@ -101,13 +114,13 @@ Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
3. `MyModel` is loaded into one of the Transformers backend classes in <gh-file:vllm/model_executor/models/transformers.py> which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
|
||||
|
||||
<details>
|
||||
<details class="code">
|
||||
<summary>configuration_my_model.py</summary>
|
||||
|
||||
```python
|
||||
@ -457,7 +470,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
|
||||
|
||||
!!! note
|
||||
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
|
||||
You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
You need to manually set mean pooling by passing `--pooler-config '{"pooling_type": "MEAN"}'`.
|
||||
|
||||
!!! note
|
||||
For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
|
||||
@ -552,7 +565,18 @@ If your model is not in the above list, we will try to automatically convert the
|
||||
|
||||
!!! important
|
||||
For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
|
||||
e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
e.g.: `--pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
|
||||
|
||||
#### Token Classification
|
||||
|
||||
These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
|
||||
| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ |
|
||||
|
||||
!!! note
|
||||
Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py>, <gh-file:examples/online_serving/pooling/ner.py>.
|
||||
|
||||
[](){ #supported-mm-models }
|
||||
|
||||
|
||||
@ -133,7 +133,7 @@ completion = client.chat.completions.create(
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={
|
||||
"guided_choice": ["positive", "negative"]
|
||||
"structured_outputs": {"choice": ["positive", "negative"]}
|
||||
}
|
||||
)
|
||||
```
|
||||
@ -317,10 +317,11 @@ Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_cli
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:embedding-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported by default:
|
||||
@ -374,7 +375,7 @@ The following extra parameters are supported:
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||
```
|
||||
|
||||
|
||||
[](){ #translations-api }
|
||||
|
||||
### Translations API
|
||||
@ -527,10 +528,11 @@ curl -v "http://127.0.0.1:8000/classify" \
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classification-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
@ -733,10 +735,11 @@ Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_mu
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classification-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
@ -815,10 +818,11 @@ Result documents will be sorted by relevance, and the `index` property can be us
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
The following [pooling parameters][pooling-params] are supported.
|
||||
The following [pooling parameters][vllm.PoolingParams] are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:common-pooling-params"
|
||||
--8<-- "vllm/pooling_params.py:classification-pooling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
@ -26,8 +26,14 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
|
||||
python examples/offline_inference/pooling/embed_matryoshka_fy.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/pooling/ner.py
|
||||
```
|
||||
|
||||
## Qwen3 reranker usage
|
||||
|
||||
```bash
|
||||
python qwen3_reranker.py
|
||||
python examples/offline_inference/pooling/qwen3_reranker.py
|
||||
```
|
||||
|
||||
54
examples/offline_inference/pooling/ner.py
Normal file
54
examples/offline_inference/pooling/ner.py
Normal file
@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
|
||||
|
||||
from argparse import Namespace
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser()
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(
|
||||
model="boltuix/NeuroBERT-NER",
|
||||
runner="pooling",
|
||||
enforce_eager=True,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Barack Obama visited Microsoft headquarters in Seattle on January 2025."
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(**vars(args))
|
||||
tokenizer = llm.get_tokenizer()
|
||||
label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
|
||||
|
||||
# Run inference
|
||||
outputs = llm.encode(prompts)
|
||||
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
logits = output.outputs.data
|
||||
predictions = logits.argmax(dim=-1)
|
||||
|
||||
# Map predictions to labels
|
||||
tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids)
|
||||
labels = [label_map[p.item()] for p in predictions]
|
||||
|
||||
# Print results
|
||||
for token, label in zip(tokens, labels):
|
||||
if token not in tokenizer.all_special_tokens:
|
||||
print(f"{token:15} → {label}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -53,7 +53,6 @@ def parse_args():
|
||||
"--method",
|
||||
type=str,
|
||||
default="eagle",
|
||||
choices=["ngram", "eagle", "eagle3", "mtp"],
|
||||
)
|
||||
parser.add_argument("--num-spec-tokens", type=int, default=2)
|
||||
parser.add_argument("--prompt-lookup-max", type=int, default=5)
|
||||
@ -118,6 +117,11 @@ def main():
|
||||
"prompt_lookup_max": args.prompt_lookup_max,
|
||||
"prompt_lookup_min": args.prompt_lookup_min,
|
||||
}
|
||||
elif args.method.endswith("mtp"):
|
||||
speculative_config = {
|
||||
"method": args.method,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"unknown method: {args.method}")
|
||||
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This file demonstrates the example usage of guided decoding
|
||||
to generate structured outputs using vLLM. It shows how to apply
|
||||
different guided decoding techniques such as Choice, Regex, JSON schema,
|
||||
and Grammar to produce structured and formatted results
|
||||
based on specific prompts.
|
||||
This file demonstrates the example usage of structured outputs
|
||||
in vLLM. It shows how to apply different constraints such as choice,
|
||||
regex, json schema, and grammar to produce structured and formatted
|
||||
results based on specific prompts.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
@ -13,19 +12,23 @@ from enum import Enum
|
||||
from pydantic import BaseModel
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
|
||||
MAX_TOKENS = 50
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
|
||||
# Structured outputs by Choice (list of possible options)
|
||||
structured_outputs_params_choice = StructuredOutputsParams(
|
||||
choice=["Positive", "Negative"]
|
||||
)
|
||||
sampling_params_choice = SamplingParams(
|
||||
structured_outputs=structured_outputs_params_choice
|
||||
)
|
||||
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
|
||||
|
||||
# Guided decoding by Regex
|
||||
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
|
||||
# Structured outputs by Regex
|
||||
structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
|
||||
sampling_params_regex = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_regex,
|
||||
structured_outputs=structured_outputs_params_regex,
|
||||
stop=["\n"],
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
@ -36,7 +39,7 @@ prompt_regex = (
|
||||
)
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
# Structured outputs by JSON using Pydantic schema
|
||||
class CarType(str, Enum):
|
||||
sedan = "sedan"
|
||||
suv = "SUV"
|
||||
@ -51,17 +54,16 @@ class CarDescription(BaseModel):
|
||||
|
||||
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
|
||||
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
|
||||
sampling_params_json = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_json,
|
||||
max_tokens=MAX_TOKENS,
|
||||
structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
|
||||
)
|
||||
prompt_json = (
|
||||
"Generate a JSON with the brand, model and car_type of"
|
||||
"Generate a JSON with the brand, model and car_type of "
|
||||
"the most iconic car from the 90's"
|
||||
)
|
||||
|
||||
# Guided decoding by Grammar
|
||||
# Structured outputs by Grammar
|
||||
simplified_sql_grammar = """
|
||||
root ::= select_statement
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 "
|
||||
condition ::= column "= " number
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
|
||||
structured_outputs_params_grammar = StructuredOutputsParams(
|
||||
grammar=simplified_sql_grammar
|
||||
)
|
||||
sampling_params_grammar = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_grammar,
|
||||
structured_outputs=structured_outputs_params_grammar,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
prompt_grammar = (
|
||||
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
|
||||
"Generate an SQL query to show the 'username' and 'email' from the 'users' table."
|
||||
)
|
||||
|
||||
|
||||
@ -93,16 +97,16 @@ def main():
|
||||
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
|
||||
|
||||
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
|
||||
format_output("Guided decoding by Choice", choice_output)
|
||||
format_output("Structured outputs by Choice", choice_output)
|
||||
|
||||
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
|
||||
format_output("Guided decoding by Regex", regex_output)
|
||||
format_output("Structured outputs by Regex", regex_output)
|
||||
|
||||
json_output = generate_output(prompt_json, sampling_params_json, llm)
|
||||
format_output("Guided decoding by JSON", json_output)
|
||||
format_output("Structured outputs by JSON", json_output)
|
||||
|
||||
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
|
||||
format_output("Guided decoding by Grammar", grammar_output)
|
||||
format_output("Structured outputs by Grammar", grammar_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -6,7 +6,7 @@ without any specific flags:
|
||||
|
||||
```bash
|
||||
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
|
||||
--guided-decoding-backend outlines
|
||||
--structured-outputs-config.backend outlines
|
||||
```
|
||||
|
||||
This example demonstrates how to generate chat completions
|
||||
|
||||
@ -42,7 +42,7 @@ python client.py
|
||||
|
||||
### Server Configuration
|
||||
|
||||
The key parameters for chunked processing are in the `--override-pooler-config`:
|
||||
The key parameters for chunked processing are in the `--pooler-config`:
|
||||
|
||||
```json
|
||||
{
|
||||
|
||||
@ -13,7 +13,7 @@ Prerequisites:
|
||||
|
||||
# MEAN pooling (processes all chunks, recommended for complete coverage)
|
||||
vllm serve intfloat/multilingual-e5-large \
|
||||
--override-pooler-config \
|
||||
--pooler-config \
|
||||
'{"pooling_type": "MEAN", "normalize": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \
|
||||
--served-model-name multilingual-e5-large \
|
||||
@ -23,7 +23,7 @@ Prerequisites:
|
||||
|
||||
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
|
||||
vllm serve BAAI/bge-large-en-v1.5 \
|
||||
--override-pooler-config \
|
||||
--pooler-config \
|
||||
'{"pooling_type": "CLS", "normalize": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \
|
||||
--served-model-name bge-large-en-v1.5 \
|
||||
|
||||
@ -103,7 +103,7 @@ POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enab
|
||||
vllm serve "$MODEL_NAME" \
|
||||
--tensor-parallel-size "$GPU_COUNT" \
|
||||
--enforce-eager \
|
||||
--override-pooler-config "$POOLER_CONFIG" \
|
||||
--pooler-config "$POOLER_CONFIG" \
|
||||
--served-model-name ${MODEL_CODE} \
|
||||
--api-key "$API_KEY" \
|
||||
--trust-remote-code \
|
||||
|
||||
@ -12,6 +12,12 @@ python examples/online_serving/pooling/cohere_rerank_client.py
|
||||
python examples/online_serving/pooling/jinaai_rerank_client.py
|
||||
```
|
||||
|
||||
## Named Entity Recognition (NER) usage
|
||||
|
||||
```bash
|
||||
python examples/online_serving/pooling/ner.py
|
||||
```
|
||||
|
||||
## Openai chat embedding for multimodal usage
|
||||
|
||||
```bash
|
||||
|
||||
71
examples/online_serving/pooling/ner.py
Normal file
71
examples/online_serving/pooling/ner.py
Normal file
@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
|
||||
|
||||
"""
|
||||
Example online usage of Pooling API for Named Entity Recognition (NER).
|
||||
|
||||
Run `vllm serve <model> --runner pooling`
|
||||
to start up the server in vLLM. e.g.
|
||||
|
||||
vllm serve boltuix/NeuroBERT-NER
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
import requests
|
||||
import torch
|
||||
|
||||
|
||||
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
|
||||
headers = {"User-Agent": "Test Client"}
|
||||
response = requests.post(api_url, headers=headers, json=prompt)
|
||||
return response
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
api_url = f"http://{args.host}:{args.port}/pooling"
|
||||
model_name = args.model
|
||||
|
||||
# Load tokenizer and config
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
label_map = config.id2label
|
||||
|
||||
# Input text
|
||||
text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
|
||||
prompt = {"model": model_name, "input": text}
|
||||
|
||||
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
|
||||
|
||||
# Run inference
|
||||
output = pooling_response.json()["data"][0]
|
||||
logits = torch.tensor(output["data"])
|
||||
predictions = logits.argmax(dim=-1)
|
||||
inputs = tokenizer(text, return_tensors="pt")
|
||||
|
||||
# Map predictions to labels
|
||||
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
|
||||
labels = [label_map[p.item()] for p in predictions]
|
||||
assert len(tokens) == len(predictions)
|
||||
|
||||
# Print results
|
||||
for token, label in zip(tokens, labels):
|
||||
if token not in tokenizer.all_special_tokens:
|
||||
print(f"{token:15} → {label}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
"content": "Classify this sentiment: vLLM is wonderful!",
|
||||
}
|
||||
],
|
||||
"extra_body": {"guided_choice": ["positive", "negative"]},
|
||||
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
|
||||
},
|
||||
"regex": {
|
||||
"messages": [
|
||||
@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
}
|
||||
],
|
||||
"extra_body": {
|
||||
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
|
||||
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
|
||||
},
|
||||
},
|
||||
"json": {
|
||||
@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
|
||||
}
|
||||
],
|
||||
"extra_body": {
|
||||
"guided_grammar": """
|
||||
"structured_outputs": {
|
||||
"grammar": """
|
||||
root ::= select_statement
|
||||
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
@ -135,6 +136,7 @@ condition ::= column "= " number
|
||||
|
||||
number ::= "1 " | "2 "
|
||||
""",
|
||||
}
|
||||
},
|
||||
},
|
||||
"structural_tag": {
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@ -327,12 +325,7 @@ def main():
|
||||
|
||||
|
||||
if args.command == "serialize":
|
||||
eng_args_dict = {f.name: getattr(args, f.name) for f in
|
||||
dataclasses.fields(EngineArgs)}
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(
|
||||
argparse.Namespace(**eng_args_dict)
|
||||
)
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
input_dir = tensorizer_dir.rstrip('/')
|
||||
suffix = args.suffix if args.suffix else uuid.uuid4().hex
|
||||
|
||||
@ -79,6 +79,7 @@ plugins:
|
||||
- "re:vllm\\._.*" # Internal modules
|
||||
- "vllm.third_party"
|
||||
- "vllm.vllm_flash_attn"
|
||||
- !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default
|
||||
- mkdocstrings:
|
||||
handlers:
|
||||
python:
|
||||
|
||||
@ -24,7 +24,7 @@ outlines_core == 0.2.11
|
||||
# required for outlines backend disk cache
|
||||
diskcache == 5.6.3
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
xgrammar == 0.1.24; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# This file was autogenerated by uv via the following command:
|
||||
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128
|
||||
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 --python-platform x86_64-manylinux_2_28
|
||||
absl-py==2.1.0
|
||||
# via rouge-score
|
||||
accelerate==1.0.1
|
||||
|
||||
@ -1,54 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""vllm.entrypoints.api_server with some extra logging for testing."""
|
||||
from collections.abc import Iterable
|
||||
from typing import Any
|
||||
|
||||
import uvicorn
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
import vllm.entrypoints.api_server
|
||||
import vllm.envs as envs
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
app = vllm.entrypoints.api_server.app
|
||||
|
||||
|
||||
class AsyncLLMEngineWithStats(AsyncLLMEngine):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._num_aborts = 0
|
||||
|
||||
async def _engine_abort(self, request_ids: Iterable[str]):
|
||||
ids = list(request_ids)
|
||||
self._num_aborts += len(ids)
|
||||
await super()._engine_abort(ids)
|
||||
|
||||
def testing_stats(self) -> dict[str, Any]:
|
||||
return {"num_aborted_requests": self._num_aborts}
|
||||
|
||||
|
||||
@app.get("/stats")
|
||||
def stats() -> Response:
|
||||
"""Get the statistics of the engine."""
|
||||
return JSONResponse(engine.testing_stats())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
|
||||
vllm.entrypoints.api_server.engine = engine
|
||||
uvicorn.run(app,
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
log_level="debug",
|
||||
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
|
||||
@ -1,12 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
@ -1,139 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import copyreg
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from multiprocessing import Pool
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
import urllib3.exceptions
|
||||
|
||||
|
||||
def _pickle_new_connection_error(obj):
|
||||
"""Custom pickler for NewConnectionError to fix tblib compatibility."""
|
||||
# Extract the original message by removing the "conn: " prefix
|
||||
full_message = obj.args[0] if obj.args else ""
|
||||
if ': ' in full_message:
|
||||
# Split off the connection part and keep the actual message
|
||||
_, actual_message = full_message.split(': ', 1)
|
||||
else:
|
||||
actual_message = full_message
|
||||
return _unpickle_new_connection_error, (actual_message, )
|
||||
|
||||
|
||||
def _unpickle_new_connection_error(message):
|
||||
"""Custom unpickler for NewConnectionError."""
|
||||
# Create with None as conn and the actual message
|
||||
return urllib3.exceptions.NewConnectionError(None, message)
|
||||
|
||||
|
||||
# Register the custom pickle/unpickle functions for tblib compatibility
|
||||
copyreg.pickle(urllib3.exceptions.NewConnectionError,
|
||||
_pickle_new_connection_error)
|
||||
|
||||
|
||||
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
|
||||
response = requests.post("http://localhost:8000/generate",
|
||||
json={
|
||||
"prompt": prompt,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": 0,
|
||||
"ignore_eos": True
|
||||
})
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def _query_server_long(prompt: str) -> dict:
|
||||
return _query_server(prompt, max_tokens=500)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server(distributed_executor_backend: str):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
commands = [
|
||||
sys.executable,
|
||||
"-u",
|
||||
str(script_path),
|
||||
"--model",
|
||||
"facebook/opt-125m",
|
||||
"--host",
|
||||
"127.0.0.1",
|
||||
"--distributed-executor-backend",
|
||||
distributed_executor_backend,
|
||||
]
|
||||
|
||||
# API Server Test Requires V0.
|
||||
my_env = os.environ.copy()
|
||||
my_env["VLLM_USE_V1"] = "0"
|
||||
uvicorn_process = subprocess.Popen(commands, env=my_env)
|
||||
yield
|
||||
uvicorn_process.terminate()
|
||||
|
||||
|
||||
@pytest.mark.timeout(300)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
|
||||
def test_api_server(api_server, distributed_executor_backend: str):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
We run both the server and requests in separate processes.
|
||||
|
||||
We test that the server can handle incoming requests, including
|
||||
multiple requests at the same time, and that it can handle requests
|
||||
being cancelled without crashing.
|
||||
"""
|
||||
with Pool(32) as pool:
|
||||
# Wait until the server is ready
|
||||
prompts = ["warm up"] * 1
|
||||
result = None
|
||||
while not result:
|
||||
try:
|
||||
for r in pool.map(_query_server, prompts):
|
||||
result = r
|
||||
break
|
||||
except requests.exceptions.ConnectionError:
|
||||
time.sleep(1)
|
||||
|
||||
# Actual tests start here
|
||||
# Try with 1 prompt
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
"http://localhost:8000/stats").json()["num_aborted_requests"]
|
||||
assert num_aborted_requests == 0
|
||||
|
||||
# Try with 100 prompts
|
||||
prompts = ["test prompt"] * 100
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
|
||||
with Pool(32) as pool:
|
||||
# Cancel requests
|
||||
prompts = ["canceled requests"] * 100
|
||||
pool.map_async(_query_server_long, prompts)
|
||||
time.sleep(0.01)
|
||||
pool.terminate()
|
||||
pool.join()
|
||||
|
||||
# check cancellation stats
|
||||
# give it some time to update the stats
|
||||
time.sleep(1)
|
||||
|
||||
num_aborted_requests = requests.get(
|
||||
"http://localhost:8000/stats").json()["num_aborted_requests"]
|
||||
assert num_aborted_requests > 0
|
||||
|
||||
# check that server still runs after cancellations
|
||||
with Pool(32) as pool:
|
||||
# Try with 100 prompts
|
||||
prompts = ["test prompt after canceled"] * 100
|
||||
for result in pool.map(_query_server, prompts):
|
||||
assert result
|
||||
@ -1,71 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.engine.async_llm_engine import RequestTracker
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_request_tracker():
|
||||
tracker = RequestTracker()
|
||||
stream_1 = tracker.add_request("1")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 1
|
||||
assert new[0]["request_id"] == "1"
|
||||
assert not aborted
|
||||
assert not stream_1.finished
|
||||
|
||||
stream_2 = tracker.add_request("2")
|
||||
stream_3 = tracker.add_request("3")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 2
|
||||
assert new[0]["request_id"] == "2"
|
||||
assert new[1]["request_id"] == "3"
|
||||
assert not aborted
|
||||
assert not stream_2.finished
|
||||
assert not stream_3.finished
|
||||
|
||||
# request_ids must be unique
|
||||
with pytest.raises(KeyError):
|
||||
tracker.add_request("1")
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
|
||||
tracker.abort_request("1")
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert len(aborted) == 1
|
||||
assert "1" in aborted
|
||||
assert not new
|
||||
assert stream_1.finished
|
||||
|
||||
stream_4 = tracker.add_request("4")
|
||||
tracker.abort_request("4")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
# aborted new requests will cancel each other out -
|
||||
# there's no need for them to propagate into the
|
||||
# engine
|
||||
assert not aborted
|
||||
assert not new
|
||||
assert stream_4.finished
|
||||
|
||||
stream_5 = tracker.add_request("5")
|
||||
assert tracker.new_requests_event.is_set()
|
||||
tracker.process_request_output(
|
||||
RequestOutput("2", "output", [], [], [], finished=True))
|
||||
await tracker.wait_for_new_requests()
|
||||
new, aborted = tracker.get_new_and_aborted_requests()
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert not aborted
|
||||
assert len(new) == 1
|
||||
assert new[0]["request_id"] == "5"
|
||||
assert stream_2.finished
|
||||
assert not stream_5.finished
|
||||
@ -76,11 +76,6 @@ def test_models(
|
||||
model_executor: str,
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
|
||||
if enable_prompt_embeds and envs.is_set(
|
||||
"VLLM_USE_V1") and envs.VLLM_USE_V1:
|
||||
pytest.skip("enable_prompt_embeds is not supported in v1.")
|
||||
|
||||
if not envs.VLLM_USE_V1:
|
||||
if async_scheduling:
|
||||
pytest.skip("async_scheduling only supported in v1.")
|
||||
@ -164,11 +159,6 @@ def test_models_distributed(
|
||||
extra_env: dict[str, str],
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
|
||||
if enable_prompt_embeds and envs.is_set(
|
||||
"VLLM_USE_V1") and envs.VLLM_USE_V1:
|
||||
pytest.skip("enable_prompt_embeds is not supported in v1.")
|
||||
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
|
||||
@ -1,189 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the short outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
|
||||
|
||||
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
|
||||
pytest tests/basic_correctness/test_preemption.py`.
|
||||
"""
|
||||
import pytest
|
||||
from prometheus_client import REGISTRY
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import SamplingParams
|
||||
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
|
||||
ENABLE_ARTIFICIAL_PREEMPT)
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"distilbert/distilgpt2",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT,
|
||||
so use VLLM_USE_V1=0 for all tests in the file.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def check_settings():
|
||||
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
|
||||
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
|
||||
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
|
||||
"pytest tests/basic_correctness/test_preemption.py`")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def distributed_executor_backend() -> str:
|
||||
# When SPMD worker is used, use distributed_executor_backend="ray"
|
||||
# to test delta input optimization works with preemption.
|
||||
return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
|
||||
def test_chunked_prefill_recompute(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
chunked_prefill_token_size: int,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
"""Ensure that chunked prefill works with preemption."""
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
enable_chunked_prefill = False
|
||||
max_num_batched_tokens = None
|
||||
if chunked_prefill_token_size != -1:
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_seqs=max_num_seqs,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
disable_log_stats=False,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
hf_output_ids, hf_output_str = hf_outputs[i]
|
||||
vllm_output_ids, vllm_output_str = vllm_outputs[i]
|
||||
assert hf_output_str == vllm_output_str, (
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
def test_preemption(
|
||||
caplog_vllm,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
"""By default, recompute preemption is enabled"""
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
total_preemption = (
|
||||
vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
|
||||
"is not enough KV cache space." in caplog_vllm.text)
|
||||
# Ensure the count bucket of request-level histogram metrics matches
|
||||
# the number of requests as a simple sanity check to ensure metrics are
|
||||
# generated
|
||||
preemption_metrics = None
|
||||
for m in REGISTRY.collect():
|
||||
if m.name == "vllm:num_preemptions":
|
||||
preemption_metrics = m
|
||||
assert preemption_metrics is not None
|
||||
total_recorded_preemption = 0
|
||||
for sample in preemption_metrics.samples:
|
||||
total_recorded_preemption += sample.value
|
||||
assert total_preemption == total_recorded_preemption
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
def test_preemption_infeasible(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
"""Verify infeasible preemption request will be ignored."""
|
||||
BLOCK_SIZE = 16
|
||||
prefill_blocks = 2
|
||||
decode_blocks = max_tokens // BLOCK_SIZE
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
block_size=BLOCK_SIZE,
|
||||
# Not enough gpu blocks to complete a single sequence.
|
||||
# preemption should happen, and the sequence should be
|
||||
# ignored instead of hanging forever.
|
||||
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
|
||||
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens,
|
||||
ignore_eos=True)
|
||||
req_outputs = vllm_model.llm.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
|
||||
< ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
|
||||
# Verify the request is ignored and not hang.
|
||||
for req_output in req_outputs:
|
||||
outputs = req_output.outputs
|
||||
assert len(outputs) == 1
|
||||
assert outputs[0].finish_reason == "length"
|
||||
@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
|
||||
"5",
|
||||
"--endpoint",
|
||||
"/v1/chat/completions",
|
||||
"--endpoint-type",
|
||||
"--backend",
|
||||
"openai-chat",
|
||||
]
|
||||
result = subprocess.run(command, capture_output=True, text=True)
|
||||
|
||||
@ -39,7 +39,8 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
|
||||
from vllm.config.model import (ConvertOption, RunnerOption,
|
||||
_get_and_verify_dtype)
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
@ -244,39 +245,6 @@ class DecoderPromptType(Enum):
|
||||
EMPTY_STR = 3
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example_encoder_decoder_prompts(
|
||||
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
|
||||
'''
|
||||
Returns an encoder prompt list and a decoder prompt list, wherein each pair
|
||||
of same-index entries in both lists corresponds to an (encoder prompt,
|
||||
decoder prompt) tuple.
|
||||
|
||||
Returns:
|
||||
|
||||
* Encoder prompt list
|
||||
* Decoder prompt list (reverse of encoder prompt list)
|
||||
'''
|
||||
|
||||
encoder_prompts = []
|
||||
for filename in _TEST_PROMPTS:
|
||||
encoder_prompts += _read_prompts(filename)
|
||||
|
||||
custom_decoder_prompts = encoder_prompts[::-1]
|
||||
empty_str_decoder_prompts = [""] * len(encoder_prompts)
|
||||
none_decoder_prompts = [None] * len(encoder_prompts)
|
||||
|
||||
# NONE decoder prompt type
|
||||
return {
|
||||
DecoderPromptType.NONE:
|
||||
zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
|
||||
DecoderPromptType.EMPTY_STR:
|
||||
zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
|
||||
DecoderPromptType.CUSTOM:
|
||||
zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example_long_prompts() -> list[str]:
|
||||
prompts = []
|
||||
@ -690,68 +658,6 @@ class HfRunner:
|
||||
return [(output_ids, output_str, output_logprobs)
|
||||
for output_ids, output_str, output_logprobs in outputs]
|
||||
|
||||
def generate_encoder_decoder_greedy_logprobs_limit(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[TokensTextLogprobs]:
|
||||
'''
|
||||
Greedy logprobs generation for vLLM encoder/decoder models
|
||||
'''
|
||||
|
||||
all_logprobs: list[list[dict[int, float]]] = []
|
||||
all_output_ids: list[list[int]] = []
|
||||
all_output_strs: list[str] = []
|
||||
|
||||
for i, (encoder_prompt, decoder_prompt) in enumerate(
|
||||
to_enc_dec_tuple_list(encoder_decoder_prompts)):
|
||||
processor_kwargs: dict[str, Any] = {
|
||||
"text": encoder_prompt,
|
||||
"return_tensors": "pt",
|
||||
}
|
||||
if images is not None and images[i] is not None:
|
||||
processor_kwargs["images"] = images[i]
|
||||
|
||||
encoder_inputs = self.processor(**processor_kwargs)
|
||||
encoder_inputs = self.wrap_device(encoder_inputs)
|
||||
|
||||
if decoder_prompt is None:
|
||||
decoder_input_ids = None
|
||||
else:
|
||||
decoder_inputs = self.tokenizer(decoder_prompt,
|
||||
return_tensors="pt")
|
||||
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
|
||||
|
||||
output = self.model.generate(
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_tokens,
|
||||
output_hidden_states=True,
|
||||
return_dict_in_generate=True,
|
||||
**encoder_inputs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
(
|
||||
seq_logprobs_lst,
|
||||
output_len,
|
||||
) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
|
||||
num_logprobs)
|
||||
|
||||
all_logprobs.append(seq_logprobs_lst)
|
||||
seq_ids = output.sequences[0]
|
||||
output_ids = seq_ids[-output_len:]
|
||||
all_output_ids.append(output_ids.tolist())
|
||||
all_output_strs.append(self.tokenizer.decode(output_ids))
|
||||
|
||||
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
|
||||
return [(output_ids, output_str, output_logprobs)
|
||||
for output_ids, output_str, output_logprobs in outputs]
|
||||
|
||||
def encode(self, prompts: list[str], *args,
|
||||
**kwargs) -> list[list[torch.Tensor]]:
|
||||
return self.model.encode(prompts, *args, **kwargs)
|
||||
@ -940,26 +846,6 @@ class VllmRunner:
|
||||
if sampling_params.prompt_logprobs is None else
|
||||
toks_str_logsprobs_prompt_logprobs)
|
||||
|
||||
def generate_encoder_decoder_w_logprobs(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
sampling_params: SamplingParams,
|
||||
) -> Union[list[TokensTextLogprobs],
|
||||
list[TokensTextLogprobsPromptLogprobs]]:
|
||||
'''
|
||||
Logprobs generation for vLLM encoder/decoder models
|
||||
'''
|
||||
|
||||
assert sampling_params.logprobs is not None
|
||||
req_outputs = self.llm.generate(encoder_decoder_prompts,
|
||||
sampling_params=sampling_params)
|
||||
toks_str_logsprobs_prompt_logprobs = (
|
||||
self._final_steps_generate_w_logprobs(req_outputs))
|
||||
# Omit prompt logprobs if not required by sampling params
|
||||
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
|
||||
if sampling_params.prompt_logprobs is None else
|
||||
toks_str_logsprobs_prompt_logprobs)
|
||||
|
||||
def generate_greedy(
|
||||
self,
|
||||
prompts: Union[list[str], list[torch.Tensor]],
|
||||
@ -1037,29 +923,6 @@ class VllmRunner:
|
||||
|
||||
return perplexities
|
||||
|
||||
def generate_encoder_decoder_greedy_logprobs(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
max_tokens: int,
|
||||
num_logprobs: Optional[int],
|
||||
num_prompt_logprobs: Optional[int] = None,
|
||||
skip_special_tokens: bool = True,
|
||||
) -> Union[list[TokensTextLogprobs],
|
||||
list[TokensTextLogprobsPromptLogprobs]]:
|
||||
greedy_logprobs_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=max_tokens,
|
||||
logprobs=num_logprobs,
|
||||
prompt_logprobs=(num_prompt_logprobs),
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
)
|
||||
'''
|
||||
Greedy logprobs generation for vLLM encoder/decoder models
|
||||
'''
|
||||
|
||||
return self.generate_encoder_decoder_w_logprobs(
|
||||
encoder_decoder_prompts, greedy_logprobs_params)
|
||||
|
||||
def generate_beam_search(
|
||||
self,
|
||||
prompts: list[str],
|
||||
|
||||
@ -1,11 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
@ -1,83 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.inputs import token_inputs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import Logprob, Sequence, SequenceStatus
|
||||
|
||||
|
||||
def sequence_with_eos(text: str, eos_token: str,
|
||||
eos_token_id: int) -> Sequence:
|
||||
"""
|
||||
Create a Sequence that ends with an EOS token.
|
||||
"""
|
||||
seq = Sequence(
|
||||
seq_id=0,
|
||||
inputs=token_inputs([]),
|
||||
block_size=16,
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
seq.output_text = text + eos_token
|
||||
|
||||
offset = eos_token_id + 1
|
||||
for i in range(offset, len(text) + offset):
|
||||
seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)})
|
||||
seq.append_token_id(token_id=eos_token_id,
|
||||
logprobs={eos_token_id: Logprob(0.0)})
|
||||
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
return seq
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
|
||||
("This text ends with EOS token", "</s>", 2),
|
||||
])
|
||||
@pytest.mark.parametrize("ignore_eos", [True, False])
|
||||
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
|
||||
ignore_eos: bool, include_stop_str_in_output: bool):
|
||||
"""
|
||||
Test the behavior of the StopChecker's maybe_stop_sequence method
|
||||
when an EOS token is encountered.
|
||||
|
||||
This test covers:
|
||||
- When the EOS token should stop the sequence and be removed from the output
|
||||
- When the EOS token should stop the sequence and be included in the output
|
||||
- When the EOS token should be ignored, and the sequence continues
|
||||
"""
|
||||
|
||||
stop_checker = StopChecker(max_model_len=1024)
|
||||
|
||||
seq = sequence_with_eos(
|
||||
text=text_wo_eos,
|
||||
eos_token=eos_token,
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
new_char_count = len(eos_token)
|
||||
|
||||
# Note that `stop` and `stop_token_ids` are not specified
|
||||
sampling_params = SamplingParams(
|
||||
min_tokens=1,
|
||||
ignore_eos=ignore_eos,
|
||||
include_stop_str_in_output=include_stop_str_in_output)
|
||||
|
||||
stop_checker.maybe_stop_sequence(
|
||||
seq=seq,
|
||||
new_char_count=new_char_count,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
if ignore_eos:
|
||||
assert seq.status == SequenceStatus.RUNNING
|
||||
assert seq.output_text == text_wo_eos + eos_token
|
||||
elif include_stop_str_in_output:
|
||||
assert seq.status == SequenceStatus.FINISHED_STOPPED
|
||||
assert seq.output_text == text_wo_eos + eos_token
|
||||
else:
|
||||
assert seq.status == SequenceStatus.FINISHED_STOPPED
|
||||
assert seq.output_text == text_wo_eos
|
||||
@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
|
||||
from vllm.config.model import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.config import get_config
|
||||
|
||||
|
||||
@ -184,7 +184,7 @@ def sample_enum_json_schema():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_guided_choice():
|
||||
def sample_structured_outputs_choices():
|
||||
return [
|
||||
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
|
||||
"Ruby", "Swift", "Kotlin"
|
||||
|
||||
@ -1,82 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import sys
|
||||
from contextlib import nullcontext
|
||||
|
||||
from vllm_test_utils import BlameResult, blame
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
|
||||
def run_normal():
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM without guided decoding as a baseline.
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.3)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
# Destroy the LLM object and free up the GPU memory.
|
||||
del llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def run_xgrammar(sample_regex):
|
||||
# Create an LLM with guided decoding enabled.
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
guided_decoding_backend="xgrammar",
|
||||
gpu_memory_utilization=0.3)
|
||||
prompt = f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
guided_decoding = GuidedDecodingParams(regex=sample_regex)
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=guided_decoding)
|
||||
outputs = llm.generate(
|
||||
prompts=[prompt] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True,
|
||||
)
|
||||
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
def test_lazy_outlines(sample_regex):
|
||||
"""If users don't use guided decoding, outlines should not be imported.
|
||||
"""
|
||||
# make sure outlines is not imported
|
||||
module_name = "outlines"
|
||||
# In CI, we only check finally if the module is imported.
|
||||
# If it is indeed imported, we can rerun the test with `use_blame=True`,
|
||||
# which will trace every function call to find the first import location,
|
||||
# and help find the root cause.
|
||||
# We don't run it in CI by default because it is slow.
|
||||
use_blame = False
|
||||
context = blame(
|
||||
lambda: module_name in sys.modules) if use_blame else nullcontext()
|
||||
with context as result:
|
||||
run_normal()
|
||||
run_xgrammar(sample_regex)
|
||||
if use_blame:
|
||||
assert isinstance(result, BlameResult)
|
||||
print(f"the first import location is:\n{result.trace_stack}")
|
||||
assert module_name not in sys.modules, (
|
||||
f"Module {module_name} is imported. To see the first"
|
||||
f" import location, run the test with `use_blame=True`.")
|
||||
@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
more_args = ["--max-num-seqs", "64"]
|
||||
|
||||
run_test(more_args)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
|
||||
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
|
||||
more_args):
|
||||
"""Run with the V0 Engine."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
run_test(more_args)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
@ -28,11 +28,9 @@ def monkeypatch_module():
|
||||
mpatch.undo()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[False, True])
|
||||
def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811
|
||||
|
||||
use_v1 = request.param
|
||||
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
|
||||
@pytest.fixture(scope="module")
|
||||
def server(monkeypatch_module, zephyr_lora_files): #noqa: F811
|
||||
monkeypatch_module.setenv('VLLM_USE_V1', '1')
|
||||
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def is_v1_server(server):
|
||||
import os
|
||||
assert os.environ['VLLM_USE_V1'] in ['0', '1']
|
||||
return os.environ['VLLM_USE_V1'] == '1'
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
@ -480,10 +471,10 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
sample_guided_choice, is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
async def test_structured_outputs_choice_chat(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_structured_outputs_choices,
|
||||
):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -498,9 +489,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.7,
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
extra_body=dict(
|
||||
structured_outputs={"choice": sample_structured_outputs_choices}))
|
||||
choice1 = chat_completion.choices[0].message.content
|
||||
assert choice1 in sample_guided_choice
|
||||
assert choice1 in sample_structured_outputs_choices
|
||||
|
||||
messages.append({"role": "assistant", "content": choice1})
|
||||
messages.append({
|
||||
@ -512,18 +504,18 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.7,
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
extra_body=dict(
|
||||
structured_outputs={"choice": sample_structured_outputs_choices}))
|
||||
choice2 = chat_completion.choices[0].message.content
|
||||
assert choice2 in sample_guided_choice
|
||||
assert choice2 in sample_structured_outputs_choices
|
||||
assert choice1 != choice2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
|
||||
async def test_structured_outputs_json_chat(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_json_schema,
|
||||
):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -538,7 +530,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema))
|
||||
extra_body=dict(structured_outputs={"json": sample_json_schema}))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json1 = json.loads(message.content)
|
||||
@ -555,7 +547,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema))
|
||||
extra_body=dict(structured_outputs={"json": sample_json_schema}))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json2 = json.loads(message.content)
|
||||
@ -565,10 +557,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
async def test_structured_outputs_regex_chat(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_regex,
|
||||
):
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
@ -583,7 +575,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex))
|
||||
extra_body=dict(structured_outputs={"regex": sample_regex}))
|
||||
ip1 = chat_completion.choices[0].message.content
|
||||
assert ip1 is not None
|
||||
assert re.fullmatch(sample_regex, ip1) is not None
|
||||
@ -594,7 +586,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex))
|
||||
extra_body=dict(structured_outputs={"regex": sample_regex}))
|
||||
ip2 = chat_completion.choices[0].message.content
|
||||
assert ip2 is not None
|
||||
assert re.fullmatch(sample_regex, ip2) is not None
|
||||
@ -602,7 +594,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
|
||||
async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -614,17 +606,19 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
|
||||
}]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
extra_body=dict(guided_regex={
|
||||
1: "Python",
|
||||
2: "C++"
|
||||
}))
|
||||
_ = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
extra_body=dict(
|
||||
structured_outputs={"regex": {
|
||||
1: "Python",
|
||||
2: "C++"
|
||||
}}))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
sample_guided_choice):
|
||||
async def test_structured_outputs_choice_chat_logprobs(
|
||||
client: openai.AsyncOpenAI, sample_structured_outputs_choices):
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
@ -641,7 +635,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
extra_body=dict(
|
||||
structured_outputs={"choice": sample_structured_outputs_choices}))
|
||||
|
||||
assert chat_completion.choices[0].logprobs is not None
|
||||
assert chat_completion.choices[0].logprobs.content is not None
|
||||
@ -653,20 +648,33 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Tool use is only supported in v1 engine")
|
||||
async def test_named_tool_use(
|
||||
client: openai.AsyncOpenAI,
|
||||
sample_json_schema,
|
||||
):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}"
|
||||
"content": ("Give an example JSON for an employee "
|
||||
"profile using the specified tool.")
|
||||
}]
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}]
|
||||
tool_choice = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
}
|
||||
|
||||
# non-streaming
|
||||
|
||||
@ -674,20 +682,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
},
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert len(message.content) == 0
|
||||
@ -705,25 +701,12 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
|
||||
|
||||
# streaming
|
||||
|
||||
stream = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice={
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
},
|
||||
stream=True)
|
||||
stream = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
stream=True)
|
||||
|
||||
output = []
|
||||
finish_reason_count = 0
|
||||
@ -826,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_response_format_json_schema(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip(
|
||||
"JSON schema response format is only supported in v1 engine")
|
||||
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
|
||||
prompt = 'what is 1+1? The format is "result": 2'
|
||||
# Check that this prompt cannot lead to a valid JSON without json_schema
|
||||
for _ in range(2):
|
||||
|
||||
@ -99,3 +99,26 @@ async def test_prompt_logprobs(client: openai.AsyncOpenAI):
|
||||
|
||||
assert completion.prompt_logprobs is not None
|
||||
assert len(completion.prompt_logprobs) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_top_logprobs(client: openai.AsyncOpenAI):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "Beijing is the capital of which country?"
|
||||
}]
|
||||
|
||||
completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
extra_body={
|
||||
"top_logprobs": -1,
|
||||
"logprobs": "true",
|
||||
},
|
||||
)
|
||||
assert completion.choices[0].logprobs is not None
|
||||
assert completion.choices[0].logprobs.content is not None
|
||||
assert len(completion.choices[0].logprobs.content) > 0
|
||||
|
||||
@ -1,831 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# imports for guided decoding tests
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import jsonschema
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import regex as re
|
||||
import requests
|
||||
# downloading lora to test lora requests
|
||||
from openai import BadRequestError
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
# technically these adapters use a different base model,
|
||||
# but we're not testing generation quality here
|
||||
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args(zephyr_lora_files):
|
||||
return [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--enforce-eager",
|
||||
# lora config
|
||||
"--enable-lora",
|
||||
"--lora-modules",
|
||||
f"zephyr-lora={zephyr_lora_files}",
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--max-cpu-loras",
|
||||
"2",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module",
|
||||
params=["", "--disable-frontend-multiprocessing"])
|
||||
def server(default_server_args, request):
|
||||
if request.param:
|
||||
default_server_args.append(request.param)
|
||||
|
||||
original_value = os.environ.get('VLLM_USE_V1')
|
||||
os.environ['VLLM_USE_V1'] = '0'
|
||||
try:
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
default_server_args) as remote_server:
|
||||
yield remote_server
|
||||
finally:
|
||||
# Restore original env value
|
||||
if original_value is None:
|
||||
os.environ.pop('VLLM_USE_V1', None)
|
||||
else:
|
||||
os.environ['VLLM_USE_V1'] = original_value
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def is_v1_server(server):
|
||||
import os
|
||||
|
||||
# For completion tests, we assume v0 since there's no explicit v1 setup
|
||||
return os.environ.get('VLLM_USE_V1', '0') == '1'
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
# first test base model, then test loras
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
|
||||
completion = await client.completions.create(model=model_name,
|
||||
prompt="Hello, my name is",
|
||||
max_tokens=5,
|
||||
temperature=0.0)
|
||||
|
||||
assert completion.id is not None
|
||||
assert completion.choices is not None and len(completion.choices) == 1
|
||||
|
||||
choice = completion.choices[0]
|
||||
assert len(choice.text) >= 5
|
||||
assert choice.finish_reason == "length"
|
||||
assert completion.usage == openai.types.CompletionUsage(
|
||||
completion_tokens=5, prompt_tokens=6, total_tokens=11)
|
||||
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert len(completion.choices[0].text) >= 1
|
||||
assert completion.choices[0].prompt_logprobs is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
|
||||
# test using token IDs
|
||||
with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
|
||||
# Added tokens should be rejected by the base model
|
||||
await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=[0, 0, 32000, 32001, 32002],
|
||||
echo=True,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
# first test base model, then test loras
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
logprobs=None,
|
||||
)
|
||||
choice = completion.choices[0]
|
||||
assert choice.logprobs is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
# just test 1 lora
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
logprobs=0,
|
||||
)
|
||||
choice = completion.choices[0]
|
||||
assert choice.logprobs is not None
|
||||
assert choice.logprobs.token_logprobs is not None
|
||||
assert choice.logprobs.top_logprobs is not None
|
||||
assert len(choice.logprobs.top_logprobs[0]) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
|
||||
# test using token IDs
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
logprobs=5,
|
||||
)
|
||||
choice = completion.choices[0]
|
||||
assert choice.logprobs is not None
|
||||
assert choice.logprobs.token_logprobs is not None
|
||||
assert choice.logprobs.top_logprobs is not None
|
||||
assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
|
||||
with pytest.raises(
|
||||
(openai.BadRequestError, openai.APIError)): # test using token IDs
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
# vLLM has higher default max_logprobs (20 instead of 5) to support
|
||||
# both Completion API and Chat Completion API
|
||||
logprobs=21,
|
||||
)
|
||||
...
|
||||
with pytest.raises(
|
||||
(openai.BadRequestError, openai.APIError)): # test using token IDs
|
||||
stream = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
# vLLM has higher default max_logprobs (20 instead of 5) to support
|
||||
# both Completion API and Chat Completion API
|
||||
logprobs=30,
|
||||
stream=True,
|
||||
)
|
||||
async for chunk in stream:
|
||||
...
|
||||
|
||||
# the server should still work afterwards
|
||||
completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=[0, 0, 0, 0, 0],
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert len(completion.choices[0].text) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
|
||||
(MODEL_NAME, 0),
|
||||
(MODEL_NAME, 1),
|
||||
(MODEL_NAME, None)])
|
||||
async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
prompt_logprobs: Optional[int]):
|
||||
params: dict = {
|
||||
"prompt": ["A robot may not injure another robot", "My name is"],
|
||||
"model": model_name,
|
||||
}
|
||||
if prompt_logprobs is not None:
|
||||
params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
|
||||
|
||||
if prompt_logprobs is not None and prompt_logprobs < 0:
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.completions.create(**params)
|
||||
else:
|
||||
completion = await client.completions.create(**params)
|
||||
if prompt_logprobs is not None:
|
||||
assert completion.choices[0].prompt_logprobs is not None
|
||||
assert len(completion.choices[0].prompt_logprobs) > 0
|
||||
|
||||
assert completion.choices[1].prompt_logprobs is not None
|
||||
assert len(completion.choices[1].prompt_logprobs) > 0
|
||||
|
||||
else:
|
||||
assert completion.choices[0].prompt_logprobs is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_completion_streaming(client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
prompt = "What is an LLM?"
|
||||
|
||||
single_completion = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
single_output = single_completion.choices[0].text
|
||||
stream = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True)
|
||||
chunks: list[str] = []
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
chunks.append(chunk.choices[0].text)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
# finish reason should only return in last block
|
||||
assert finish_reason_count == 1
|
||||
assert chunk.choices[0].finish_reason == "length"
|
||||
assert chunk.choices[0].text
|
||||
assert "".join(chunks) == single_output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
|
||||
"""Streaming for parallel sampling.
|
||||
The tokens from multiple samples, are flattened into a single stream,
|
||||
with an index to indicate which sample the token belongs to.
|
||||
"""
|
||||
|
||||
prompt = "What is an LLM?"
|
||||
n = 3
|
||||
max_tokens = 5
|
||||
|
||||
stream = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
n=n,
|
||||
stream=True)
|
||||
chunks: list[list[str]] = [[] for i in range(n)]
|
||||
finish_reason_count = 0
|
||||
async for chunk in stream:
|
||||
index = chunk.choices[0].index
|
||||
text = chunk.choices[0].text
|
||||
chunks[index].append(text)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
assert finish_reason_count == n
|
||||
for chunk in chunks:
|
||||
assert len(chunk) == max_tokens
|
||||
print("".join(chunk))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_completion_stream_options(client: openai.AsyncOpenAI,
|
||||
model_name: str):
|
||||
prompt = "What is the capital of France?"
|
||||
|
||||
# Test stream=True, stream_options=
|
||||
# {"include_usage": False, "continuous_usage_stats": False}
|
||||
stream = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
stream_options={
|
||||
"include_usage": False,
|
||||
"continuous_usage_stats":
|
||||
False,
|
||||
})
|
||||
|
||||
async for chunk in stream:
|
||||
assert chunk.usage is None
|
||||
|
||||
# Test stream=True, stream_options=
|
||||
# {"include_usage": False, "continuous_usage_stats": True}
|
||||
stream = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
stream_options={
|
||||
"include_usage": False,
|
||||
"continuous_usage_stats":
|
||||
True,
|
||||
})
|
||||
async for chunk in stream:
|
||||
assert chunk.usage is None
|
||||
|
||||
# Test stream=True, stream_options=
|
||||
# {"include_usage": True, "continuous_usage_stats": False}
|
||||
stream = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
stream_options={
|
||||
"include_usage": True,
|
||||
"continuous_usage_stats":
|
||||
False,
|
||||
})
|
||||
async for chunk in stream:
|
||||
if chunk.choices[0].finish_reason is None:
|
||||
assert chunk.usage is None
|
||||
else:
|
||||
assert chunk.usage is None
|
||||
final_chunk = await stream.__anext__()
|
||||
assert final_chunk.usage is not None
|
||||
assert final_chunk.usage.prompt_tokens > 0
|
||||
assert final_chunk.usage.completion_tokens > 0
|
||||
assert final_chunk.usage.total_tokens == (
|
||||
final_chunk.usage.prompt_tokens +
|
||||
final_chunk.usage.completion_tokens)
|
||||
assert final_chunk.choices == []
|
||||
|
||||
# Test stream=True, stream_options=
|
||||
# {"include_usage": True, "continuous_usage_stats": True}
|
||||
stream = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
stream_options={
|
||||
"include_usage": True,
|
||||
"continuous_usage_stats":
|
||||
True,
|
||||
})
|
||||
async for chunk in stream:
|
||||
assert chunk.usage is not None
|
||||
assert chunk.usage.prompt_tokens > 0
|
||||
assert chunk.usage.completion_tokens > 0
|
||||
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
|
||||
chunk.usage.completion_tokens)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
final_chunk = await stream.__anext__()
|
||||
assert final_chunk.usage is not None
|
||||
assert final_chunk.usage.prompt_tokens > 0
|
||||
assert final_chunk.usage.completion_tokens > 0
|
||||
assert final_chunk.usage.total_tokens == (
|
||||
final_chunk.usage.prompt_tokens +
|
||||
final_chunk.usage.completion_tokens)
|
||||
assert final_chunk.choices == []
|
||||
|
||||
# Test stream=False, stream_options=
|
||||
# {"include_usage": None}
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
stream_options={"include_usage": None})
|
||||
|
||||
# Test stream=False, stream_options=
|
||||
# {"include_usage": True}
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
stream_options={"include_usage": True})
|
||||
|
||||
# Test stream=False, stream_options=
|
||||
# {"continuous_usage_stats": None}
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
stream_options={"continuous_usage_stats": None})
|
||||
|
||||
# Test stream=False, stream_options=
|
||||
# {"continuous_usage_stats": True}
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=False,
|
||||
stream_options={"continuous_usage_stats": True})
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
|
||||
# test both text and token IDs
|
||||
for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
|
||||
# test simple list
|
||||
batch = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompts,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert len(batch.choices) == 2
|
||||
assert batch.choices[0].text == batch.choices[1].text
|
||||
|
||||
# test n = 2
|
||||
batch = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompts,
|
||||
n=2,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body=dict(
|
||||
# NOTE: this has to be true for n > 1 in vLLM, but
|
||||
# not necessary for official client.
|
||||
use_beam_search=True),
|
||||
)
|
||||
assert len(batch.choices) == 4
|
||||
assert batch.choices[0].text != batch.choices[
|
||||
1].text, "beam search should be different"
|
||||
assert batch.choices[0].text == batch.choices[
|
||||
2].text, "two copies of the same prompt should be the same"
|
||||
assert batch.choices[1].text == batch.choices[
|
||||
3].text, "two copies of the same prompt should be the same"
|
||||
|
||||
# test streaming
|
||||
batch = await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompts,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
)
|
||||
texts = [""] * 2
|
||||
async for chunk in batch:
|
||||
assert len(chunk.choices) == 1
|
||||
choice = chunk.choices[0]
|
||||
texts[choice.index] += choice.text
|
||||
assert texts[0] == texts[1]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_logits_bias(client: openai.AsyncOpenAI):
|
||||
prompt = "Hello, my name is"
|
||||
max_tokens = 5
|
||||
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
|
||||
|
||||
# Test exclusive selection
|
||||
token_id = 1000
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
logit_bias={str(token_id): 100},
|
||||
seed=42,
|
||||
)
|
||||
assert len(completion.choices[0].text) >= 5
|
||||
response_tokens = tokenizer(completion.choices[0].text,
|
||||
add_special_tokens=False)["input_ids"]
|
||||
expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
|
||||
add_special_tokens=False)["input_ids"]
|
||||
assert all([
|
||||
response == expected
|
||||
for response, expected in zip(response_tokens, expected_tokens)
|
||||
])
|
||||
|
||||
# Test ban
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
)
|
||||
response_tokens = tokenizer(completion.choices[0].text,
|
||||
add_special_tokens=False)["input_ids"]
|
||||
first_response = completion.choices[0].text
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
logit_bias={str(token): -100
|
||||
for token in response_tokens},
|
||||
)
|
||||
assert first_response != completion.choices[0].text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
|
||||
prompt = "Hello, my name is"
|
||||
max_tokens = 1
|
||||
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
|
||||
|
||||
# Test exclusive selection
|
||||
allowed_ids = [21555, 21557, 21558]
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
seed=42,
|
||||
extra_body=dict(allowed_token_ids=allowed_ids),
|
||||
logprobs=1,
|
||||
)
|
||||
response_tokens = completion.choices[0].logprobs.tokens
|
||||
assert len(response_tokens) == 1
|
||||
assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_json_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema, is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}",
|
||||
n=3,
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
extra_body=dict(guided_json=sample_json_schema,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 3
|
||||
for i in range(3):
|
||||
output_json = json.loads(completion.choices[i].text)
|
||||
jsonschema.validate(instance=output_json, schema=sample_json_schema)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_regex, is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
|
||||
n=3,
|
||||
temperature=1.0,
|
||||
max_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 3
|
||||
for i in range(3):
|
||||
assert re.fullmatch(sample_regex,
|
||||
completion.choices[i].text) is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="The best language for type-safe systems programming is ",
|
||||
n=2,
|
||||
temperature=1.0,
|
||||
max_tokens=10,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
|
||||
assert completion.id is not None
|
||||
assert len(completion.choices) == 2
|
||||
for i in range(2):
|
||||
assert completion.choices[i].text in sample_guided_choice
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_guided_grammar(client: openai.AsyncOpenAI,
|
||||
sample_sql_statements, is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided grammar is only supported in v1 engine")
|
||||
|
||||
completion = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=("Generate a sql state that select col_1 from "
|
||||
"table_1 where it is equals to 1"),
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
extra_body=dict(guided_grammar=sample_sql_statements))
|
||||
|
||||
content = completion.choices[0].text
|
||||
|
||||
# use Lark to parse the output, and make sure it's a valid parse tree
|
||||
from lark import Lark
|
||||
parser = Lark(sample_sql_statements)
|
||||
parser.parse(content)
|
||||
|
||||
# remove spaces for comparison b/c we removed them in the grammar
|
||||
ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
|
||||
|
||||
assert content.strip() == ground_truth
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
# first test base model, then test loras
|
||||
"model_name",
|
||||
[MODEL_NAME, "zephyr-lora"],
|
||||
)
|
||||
@pytest.mark.parametrize("logprobs_arg", [1, 0])
|
||||
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
|
||||
model_name: str, logprobs_arg: int):
|
||||
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
|
||||
# test using text and token IDs
|
||||
for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
|
||||
completion = await client.completions.create(model=model_name,
|
||||
prompt=prompt,
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
echo=True,
|
||||
logprobs=logprobs_arg)
|
||||
|
||||
prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
|
||||
list) else prompt
|
||||
assert re.search(r"^" + prompt_text, completion.choices[0].text)
|
||||
logprobs = completion.choices[0].logprobs
|
||||
assert logprobs is not None
|
||||
assert len(logprobs.text_offset) > 5
|
||||
assert (len(logprobs.token_logprobs) > 5
|
||||
and logprobs.token_logprobs[0] is None)
|
||||
assert (len(logprobs.top_logprobs) > 5
|
||||
and logprobs.top_logprobs[0] is None)
|
||||
for top_logprobs in logprobs.top_logprobs[1:]:
|
||||
assert max(logprobs_arg,
|
||||
1) <= len(top_logprobs) <= logprobs_arg + 1
|
||||
assert len(logprobs.tokens) > 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema, sample_regex,
|
||||
is_v1_server: bool):
|
||||
if not is_v1_server:
|
||||
pytest.skip("Guided decoding is only supported in v1 engine")
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="Give an example JSON that fits this schema: 42",
|
||||
extra_body=dict(guided_json=42,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="Give an example string that fits this regex",
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_json=sample_json_schema))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name,stream,echo",
|
||||
[
|
||||
(MODEL_NAME, False, False),
|
||||
(MODEL_NAME, False, True),
|
||||
(MODEL_NAME, True, False),
|
||||
(MODEL_NAME, True, True) # should not raise BadRequestError error
|
||||
],
|
||||
)
|
||||
async def test_echo_stream_completion(client: openai.AsyncOpenAI,
|
||||
model_name: str, stream: bool,
|
||||
echo: bool):
|
||||
saying: str = "Hello, my name is"
|
||||
result = await client.completions.create(model=model_name,
|
||||
prompt=saying,
|
||||
max_tokens=10,
|
||||
temperature=0.0,
|
||||
echo=echo,
|
||||
stream=stream)
|
||||
|
||||
stop_reason = "length"
|
||||
|
||||
if not stream:
|
||||
completion = result
|
||||
assert completion.id is not None
|
||||
assert completion.choices is not None and len(completion.choices) == 1
|
||||
|
||||
choice = completion.choices[0]
|
||||
assert len(choice.text) >= 5
|
||||
assert choice.finish_reason == stop_reason
|
||||
|
||||
if echo:
|
||||
assert choice.text is not None and saying in choice.text
|
||||
else:
|
||||
assert choice.text is not None and saying not in choice.text
|
||||
|
||||
else:
|
||||
chunks: list[str] = []
|
||||
final_finish_reason = None
|
||||
async for chunk in result:
|
||||
if chunk.choices and chunk.choices[0].text:
|
||||
chunks.append(chunk.choices[0].text)
|
||||
if chunk.choices and chunk.choices[0].finish_reason:
|
||||
final_finish_reason = chunk.choices[0].finish_reason
|
||||
|
||||
assert final_finish_reason == stop_reason
|
||||
content = "".join(chunks)
|
||||
if echo:
|
||||
assert content is not None and saying in content
|
||||
else:
|
||||
assert content is not None and saying not in content
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_invocations(server: RemoteOpenAIServer,
|
||||
client: openai.AsyncOpenAI):
|
||||
request_args = {
|
||||
"model": MODEL_NAME,
|
||||
"prompt": "Hello, my name is",
|
||||
"max_tokens": 5,
|
||||
"temperature": 0.0,
|
||||
"logprobs": None,
|
||||
}
|
||||
|
||||
completion = await client.completions.create(**request_args)
|
||||
|
||||
invocation_response = requests.post(server.url_for("invocations"),
|
||||
json=request_args)
|
||||
invocation_response.raise_for_status()
|
||||
|
||||
completion_output = completion.model_dump()
|
||||
invocation_output = invocation_response.json()
|
||||
|
||||
assert completion_output.keys() == invocation_output.keys()
|
||||
assert completion_output["choices"] == invocation_output["choices"]
|
||||
@ -142,7 +142,7 @@ def server(): # noqa: F811
|
||||
"--dtype",
|
||||
"half",
|
||||
"--enable-auto-tool-choice",
|
||||
"--guided-decoding-backend",
|
||||
"--structured-outputs-config.backend",
|
||||
"xgrammar",
|
||||
"--tool-call-parser",
|
||||
"hermes",
|
||||
@ -225,7 +225,7 @@ def k2_server(): # noqa: F811
|
||||
"--dtype",
|
||||
"half",
|
||||
"--enable-auto-tool-choice",
|
||||
"--guided-decoding-backend",
|
||||
"--structured-outputs-config.backend",
|
||||
"xgrammar",
|
||||
"--tool-call-parser",
|
||||
"hermes",
|
||||
|
||||
@ -14,6 +14,9 @@ from transformers import AutoConfig
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
pytest.skip("Skipping prompt_embeds test until V1 supports it.",
|
||||
allow_module_level=True)
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
@ -33,7 +36,6 @@ def default_server_args() -> list[str]:
|
||||
"--enforce-eager",
|
||||
# Prompt Embeds server args
|
||||
"--enable-prompt-embeds",
|
||||
"--no-enable-chunked-prefill",
|
||||
]
|
||||
|
||||
|
||||
@ -228,3 +230,20 @@ async def test_completions_with_logprobs_and_prompt_embeds(
|
||||
assert max(logprobs_arg,
|
||||
1) <= len(top_logprobs) <= logprobs_arg + 1
|
||||
assert len(logprobs.tokens) == 5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_prompt_logprobs_raises_error(
|
||||
client_with_prompt_embeds: openai.AsyncOpenAI):
|
||||
with pytest.raises(BadRequestError, match="not compatible"):
|
||||
encoded_embeds = create_dummy_embeds()
|
||||
await client_with_prompt_embeds.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt="",
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
extra_body={
|
||||
"prompt_embeds": encoded_embeds,
|
||||
"prompt_logprobs": True
|
||||
},
|
||||
)
|
||||
|
||||
@ -53,12 +53,13 @@ def monkeypatch_module():
|
||||
mpatch.undo()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[False, True])
|
||||
@pytest.fixture(scope="module", params=[True])
|
||||
def server_with_lora_modules_json(request, monkeypatch_module,
|
||||
zephyr_lora_files):
|
||||
|
||||
use_v1 = request.param
|
||||
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
|
||||
assert use_v1
|
||||
monkeypatch_module.setenv('VLLM_USE_V1', '1')
|
||||
|
||||
# Define the json format LoRA module configurations
|
||||
lora_module_1 = {
|
||||
|
||||
@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
||||
PREV_MINOR_VERSION = version._prev_minor_version()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[True, False])
|
||||
@pytest.fixture(scope="module", params=[True])
|
||||
def use_v1(request):
|
||||
# Module-scoped variant of run_with_both_engines
|
||||
#
|
||||
|
||||
@ -102,12 +102,14 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
|
||||
if "custom" in tool_call:
|
||||
return False
|
||||
|
||||
# Sometimes guided_grammar is generated to be empty
|
||||
# Sometimes structured_outputs.grammar is generated to be empty
|
||||
# Causing a server error in EBNF grammar parsing
|
||||
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
|
||||
guided_grammar = case.body.get("guided_grammar")
|
||||
structured_outputs = case.body.get("structured_outputs", {})
|
||||
grammar = structured_outputs.get("grammar") if isinstance(
|
||||
structured_outputs, dict) else None
|
||||
|
||||
if guided_grammar == '':
|
||||
if grammar == '':
|
||||
# Allow None (will be handled as no grammar)
|
||||
# But skip empty strings
|
||||
return False
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
import io
|
||||
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import openai
|
||||
import pybase64
|
||||
import pytest
|
||||
|
||||
@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
|
||||
assert response3.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_streaming_types(client: OpenAI, model_name: str):
|
||||
prompts = [
|
||||
"tell me a story about a cat in 20 words",
|
||||
]
|
||||
|
||||
# this links the "done" type with the "start" type
|
||||
# so every "done" type should have a corresponding "start" type
|
||||
# and every open block should be closed by the end of the stream
|
||||
pairs_of_event_types = {
|
||||
"response.completed": "response.created",
|
||||
"response.output_item.done": "response.output_item.added",
|
||||
"response.content_part.done": "response.content_part.added",
|
||||
"response.output_text.done": "response.output_text.delta",
|
||||
"response.web_search_call.done": "response.web_search_call.added",
|
||||
"response.reasoning_text.done": "response.reasoning_text.delta",
|
||||
"response.reasoning_part.done": "response.reasoning_part.added",
|
||||
}
|
||||
|
||||
for prompt in prompts:
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=prompt,
|
||||
reasoning={"effort": "low"},
|
||||
tools=[],
|
||||
stream=True,
|
||||
background=False,
|
||||
)
|
||||
|
||||
stack_of_event_types = []
|
||||
async for event in response:
|
||||
if event.type == 'response.created':
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type == 'response.completed':
|
||||
assert stack_of_event_types[-1] == pairs_of_event_types[
|
||||
event.type]
|
||||
stack_of_event_types.pop()
|
||||
if event.type.endswith("added"):
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type.endswith("delta"):
|
||||
if stack_of_event_types[-1] == event.type:
|
||||
continue
|
||||
stack_of_event_types.append(event.type)
|
||||
elif event.type.endswith("done"):
|
||||
assert stack_of_event_types[-1] == pairs_of_event_types[
|
||||
event.type]
|
||||
stack_of_event_types.pop()
|
||||
assert len(stack_of_event_types) == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize("background", [True, False])
|
||||
@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
|
||||
assert event.item_id == current_item_id
|
||||
|
||||
# verify content_index_id is correct
|
||||
if event.type == "response.content_part.added":
|
||||
if event.type in [
|
||||
"response.content_part.added",
|
||||
"response.reasoning_part.added"
|
||||
]:
|
||||
assert event.content_index != current_content_index
|
||||
current_content_index = event.content_index
|
||||
elif event.type in [
|
||||
@ -461,6 +515,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
|
||||
model=model_name,
|
||||
input="What's the weather like in Paris today?",
|
||||
tools=tools,
|
||||
temperature=0.0,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
@ -689,3 +744,18 @@ async def test_function_calling_full_history(client: OpenAI, model_name: str):
|
||||
assert response_2 is not None
|
||||
assert response_2.status == "completed"
|
||||
assert response_2.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_output_messages_enabled(client: OpenAI, model_name: str,
|
||||
server):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is the capital of South Korea?",
|
||||
extra_body={"enable_response_messages": True})
|
||||
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
assert len(response.input_messages) > 0
|
||||
assert len(response.output_messages) > 0
|
||||
|
||||
@ -10,8 +10,30 @@ import pytest
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .test_completion import default_server_args # noqa: F401
|
||||
from .test_completion import MODEL_NAME
|
||||
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args(zephyr_lora_files):
|
||||
return [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"8192",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--enforce-eager",
|
||||
# lora config
|
||||
"--enable-lora",
|
||||
"--lora-modules",
|
||||
f"zephyr-lora={zephyr_lora_files}",
|
||||
"--max-lora-rank",
|
||||
"64",
|
||||
"--max-cpu-loras",
|
||||
"2",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
||||
@ -333,7 +333,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
@ -378,7 +377,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
@ -433,7 +431,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
@ -489,7 +486,6 @@ async def test_serving_chat_could_load_correct_generation_config():
|
||||
"role": "user",
|
||||
"content": "what is 1+1?"
|
||||
}],
|
||||
guided_decoding_backend="outlines",
|
||||
)
|
||||
|
||||
with suppress(Exception):
|
||||
|
||||
@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
|
||||
DTYPE = "float16"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import io
|
||||
import json
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import io
|
||||
# imports for guided decoding tests
|
||||
# imports for structured outputs tests
|
||||
import json
|
||||
|
||||
import httpx
|
||||
|
||||
@ -216,7 +216,7 @@ def server_with_chunked_processing():
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"512", # Set smaller max_model_len to trigger chunking mechanism
|
||||
'--override-pooler-config',
|
||||
'--pooler-config',
|
||||
('{"pooling_type": "MEAN", "normalize": true, '
|
||||
'"enable_chunked_processing": true, "max_embed_len": 10000}'),
|
||||
"--gpu-memory-utilization",
|
||||
|
||||
@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
|
||||
global WORKER_RUNTIME_SECONDS
|
||||
WORKER_RUNTIME_SECONDS = 0.5
|
||||
|
||||
# Copy the args to avoid mutating the
|
||||
# Copy the args to avoid mutating them
|
||||
args = api_server_args.copy()
|
||||
|
||||
if not with_stats_update:
|
||||
|
||||
@ -83,7 +83,7 @@ def ref_paged_attn(
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
|
||||
@pytest.mark.parametrize("sliding_window", [None, 256])
|
||||
@pytest.mark.parametrize("sliding_window", [None, 64, 128, 256])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None, 50.0])
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@ -102,9 +102,6 @@ def test_triton_unified_attn(
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32:
|
||||
pytest.skip("block size must be at least 32 for fp8")
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
|
||||
@ -1,9 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from packaging.version import Version
|
||||
from transformers import AutoConfig
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
@ -15,6 +18,7 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
|
||||
head_size: int, max_position_embeddings: int,
|
||||
dtype: torch.dtype, device: torch.device):
|
||||
"""Generate test data for given configuration."""
|
||||
current_platform.seed_everything(42)
|
||||
# Create 2D positions (3, num_tokens) for multimodal case
|
||||
positions = torch.randint(0,
|
||||
max_position_embeddings // 4, (3, num_tokens),
|
||||
@ -33,22 +37,37 @@ def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
|
||||
return positions, query, key
|
||||
|
||||
|
||||
def unroll_model_tp_dict(model_tp_dict):
|
||||
return [(model_name, tp_size)
|
||||
for model_name, tp_sizes in model_tp_dict.items()
|
||||
for tp_size in tp_sizes]
|
||||
class MRoPETestInfo(NamedTuple):
|
||||
model_name: str
|
||||
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
|
||||
atol: float = 1e-2
|
||||
rtol: float = 1.6e-2
|
||||
marks: list[pytest.MarkDecorator] = []
|
||||
|
||||
|
||||
model_tp_dict = {
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2-VL-72B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2.5-VL-72B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2],
|
||||
}
|
||||
TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version
|
||||
|
||||
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
|
||||
dtype_atol_rtol_list = [
|
||||
[torch.bfloat16, 1e-2, 1.6e-2],
|
||||
MODELS_TO_TEST = [
|
||||
MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"),
|
||||
MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"),
|
||||
MRoPETestInfo(
|
||||
model_name="Qwen/Qwen3-VL-4B-Instruct",
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
|
||||
reason="Qwen3-VL only available after Transformers v4.57",
|
||||
)
|
||||
]),
|
||||
MRoPETestInfo(
|
||||
model_name="Qwen/Qwen3-VL-30B-A3B-Instruct",
|
||||
marks=[
|
||||
pytest.mark.skipif(
|
||||
Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"),
|
||||
reason="Qwen3-VL only available after Transformers v4.57",
|
||||
)
|
||||
]),
|
||||
]
|
||||
|
||||
num_tokens_list = [11, 8192]
|
||||
@ -56,20 +75,29 @@ num_tokens_list = [11, 8192]
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize("model_name, tp_size",
|
||||
unroll_model_tp_dict(model_tp_dict))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("model_info, model_name", [
|
||||
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
|
||||
for test_config in MODELS_TO_TEST
|
||||
])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("num_tokens", num_tokens_list)
|
||||
def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
|
||||
def test_mrope(model_name: str, model_info: MRoPETestInfo, tp_size: int,
|
||||
dtype: torch.dtype, num_tokens: int):
|
||||
|
||||
atol = model_info.atol
|
||||
rtol = model_info.rtol
|
||||
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
config = config.get_text_config()
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
head_dim = (config.head_dim if hasattr(config, "head_dim") else
|
||||
config.hidden_size // total_num_heads)
|
||||
is_neox_style = True
|
||||
|
||||
rope_theta = config.rope_theta
|
||||
@ -111,24 +139,30 @@ def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, tp_size",
|
||||
unroll_model_tp_dict({
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2]
|
||||
}))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("num_tokens", [4])
|
||||
def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol,
|
||||
num_tokens):
|
||||
@pytest.mark.parametrize("model_info, model_name", [
|
||||
pytest.param(test_config, test_config.model_name, marks=test_config.marks)
|
||||
for test_config in MODELS_TO_TEST
|
||||
])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("num_tokens", num_tokens_list)
|
||||
def test_mrope_torch_compile_tracing(model_name: str,
|
||||
model_info: MRoPETestInfo, tp_size: int,
|
||||
dtype: torch.dtype, num_tokens: int):
|
||||
|
||||
atol = model_info.atol
|
||||
rtol = model_info.rtol
|
||||
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
config = config.get_text_config()
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
head_dim = (config.head_dim if hasattr(config, "head_dim") else
|
||||
config.hidden_size // total_num_heads)
|
||||
is_neox_style = True
|
||||
rope_theta = config.rope_theta
|
||||
max_position = config.max_position_embeddings
|
||||
|
||||
@ -11,7 +11,8 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
||||
from vllm.config import VllmConfig, current_platform, set_current_vllm_config
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
|
||||
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
|
||||
|
||||
|
||||
@ -8,11 +8,12 @@ import torch
|
||||
from safetensors.torch import load_file
|
||||
from torch import nn
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA)
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
|
||||
LRUCacheLoRAModelManager)
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
@ -435,10 +436,19 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
|
||||
target_modules=["layer1.dense1", "dense2"],
|
||||
lora_dtype=DEFAULT_DTYPE,
|
||||
)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
worker_adapter_manager = LRUCacheWorkerLoRAManager(
|
||||
4, 2,
|
||||
dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size,
|
||||
lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
|
||||
worker_adapter_manager.max_num_seqs = 4
|
||||
worker_adapter_manager.max_num_batched_tokens = 2
|
||||
|
||||
worker_adapter_manager.create_lora_manager(dummy_model)
|
||||
|
||||
mapping = LoRAMapping([], [])
|
||||
@ -517,10 +527,20 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
|
||||
max_cpu_loras=4,
|
||||
max_loras=4,
|
||||
lora_dtype=DEFAULT_DTYPE)
|
||||
worker_adapter_manager = WorkerLoRAManager(
|
||||
4, 2, dummy_model_gate_up.unpadded_vocab_size -
|
||||
lora_config.lora_extra_vocab_size, lora_config, device,
|
||||
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config,
|
||||
lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
|
||||
worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
|
||||
EMBEDDING_MODULES,
|
||||
EMBEDDING_PADDING_MODULES)
|
||||
worker_adapter_manager.vocab_size = (
|
||||
dummy_model_gate_up.unpadded_vocab_size -
|
||||
lora_config.lora_extra_vocab_size)
|
||||
worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
|
||||
|
||||
dummy_lora_files = f"{tmp_path}/lora_adapter"
|
||||
|
||||
@ -9,7 +9,7 @@ from typing import Optional, Union
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
|
||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
|
||||
|
||||
class DummyLoRAManager:
|
||||
|
||||
@ -125,12 +125,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
# in parts of the operators
|
||||
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
|
||||
|
||||
# Note: can be removed when
|
||||
# https://github.com/vllm-project/vllm/pull/24278 finished
|
||||
if current_platform.is_cpu() and use_prompt_embeds:
|
||||
pytest.skip("Skipping use_prompt_embeds=True with "
|
||||
"V1-only CPU backend.")
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
@ -418,7 +418,9 @@ def test_full_cuda_graph(
|
||||
@pytest.mark.parametrize("model", FP32_STATE_MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_fp32_state(
|
||||
@pytest.mark.parametrize("cache_dtype_param",
|
||||
["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
|
||||
def test_fp32_cache_state(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@ -426,6 +428,7 @@ def test_fp32_state(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
cache_dtype_param: str,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
@ -443,13 +446,13 @@ def test_fp32_state(
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
mamba_ssm_cache_dtype="float32") as vllm_model:
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
mamba_ssm_cache_dtype="float32") as vllm_model:
|
||||
**{cache_dtype_param: "float32"}) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@ def test_models(
|
||||
|
||||
vllm_extra_kwargs = {}
|
||||
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
|
||||
vllm_extra_kwargs["override_pooler_config"] = \
|
||||
vllm_extra_kwargs["pooler_config"] = \
|
||||
PoolerConfig(pooling_type="MEAN", normalize=False)
|
||||
|
||||
max_model_len: Optional[int] = 512
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@ -99,7 +100,7 @@ def test_gemma_multimodal(
|
||||
convert="classify",
|
||||
load_format="auto",
|
||||
hf_overrides=update_config,
|
||||
override_pooler_config={"pooling_type": "LAST"},
|
||||
pooler_config=PoolerConfig(pooling_type="LAST"),
|
||||
max_model_len=512,
|
||||
enforce_eager=True,
|
||||
tensor_parallel_size=1,
|
||||
|
||||
@ -24,18 +24,18 @@ def test_classify_models_using_activation(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
activation=False)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=False)) as vllm_model:
|
||||
wo_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
activation=True)) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(activation=True)) as vllm_model:
|
||||
w_activation_out = vllm_model.classify(example_prompts)
|
||||
|
||||
for wo_activation, w_activation in zip(wo_activation_out,
|
||||
@ -43,9 +43,8 @@ def test_classify_models_using_activation(
|
||||
wo_activation = torch.tensor(wo_activation)
|
||||
w_activation = torch.tensor(w_activation)
|
||||
|
||||
assert not torch.allclose(
|
||||
wo_activation, w_activation,
|
||||
atol=1e-2), "override_pooler_config is not working"
|
||||
assert not torch.allclose(wo_activation, w_activation,
|
||||
atol=1e-2), "pooler_config is not working"
|
||||
assert torch.allclose(softmax(wo_activation), w_activation,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
|
||||
@ -65,23 +64,22 @@ def test_embed_models_using_normalize(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(
|
||||
normalize=False)) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
pooler_config=PoolerConfig(normalize=False)) as vllm_model:
|
||||
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(normalize=True)) as vllm_model:
|
||||
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
|
||||
|
||||
assert not torch.allclose(
|
||||
wo_normalize, w_normalize,
|
||||
atol=1e-2), "override_pooler_config normalize is not working"
|
||||
atol=1e-2), "pooler_config normalize is not working"
|
||||
assert torch.allclose(
|
||||
F.normalize(wo_normalize, p=2, dim=-1), w_normalize,
|
||||
atol=1e-2), "w_normal should be close to normal(wo_normal)."
|
||||
@ -102,18 +100,16 @@ def test_reward_models_using_softmax(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=False)) as vllm_model:
|
||||
wo_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
override_pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
with vllm_runner(model,
|
||||
max_model_len=1024,
|
||||
dtype=dtype,
|
||||
pooler_config=PoolerConfig(softmax=True)) as vllm_model:
|
||||
w_softmax = vllm_model.encode(example_prompts)
|
||||
|
||||
for wo, w in zip(wo_softmax, w_softmax):
|
||||
@ -121,7 +117,7 @@ def test_reward_models_using_softmax(
|
||||
w = torch.tensor(w)
|
||||
|
||||
assert not torch.allclose(
|
||||
wo, w, atol=1e-2), "override_pooler_config softmax is not working"
|
||||
wo, w, atol=1e-2), "pooler_config softmax is not working"
|
||||
assert torch.allclose(
|
||||
softmax(wo), w,
|
||||
atol=1e-2), "w_softmax should be close to softmax(wo_softmax)."
|
||||
39
tests/models/language/pooling/test_token_classification.py
Normal file
39
tests/models/language/pooling/test_token_classification.py
Normal file
@ -0,0 +1,39 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForTokenClassification
|
||||
|
||||
from tests.models.utils import softmax
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
|
||||
# The float32 is required for this tiny model to pass the test.
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@torch.inference_mode
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
|
||||
vllm_outputs = vllm_model.encode(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForTokenClassification) as hf_model:
|
||||
tokenizer = hf_model.tokenizer
|
||||
hf_outputs = []
|
||||
for prompt in example_prompts:
|
||||
inputs = tokenizer([prompt], return_tensors="pt")
|
||||
inputs = hf_model.wrap_device(inputs)
|
||||
output = hf_model.model(**inputs)
|
||||
hf_outputs.append(softmax(output.logits[0]))
|
||||
|
||||
# check logits difference
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output).cpu().float()
|
||||
vllm_output = torch.tensor(vllm_output).cpu().float()
|
||||
assert torch.allclose(hf_output, vllm_output, 1e-2)
|
||||
@ -414,6 +414,7 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
|
||||
|
||||
# [Cross-encoder]
|
||||
"BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501
|
||||
"BertForTokenClassification": _HfExamplesInfo("boltuix/NeuroBERT-NER"),
|
||||
"GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501
|
||||
trust_remote_code=True,
|
||||
hf_overrides={
|
||||
|
||||
@ -7,7 +7,6 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import ModelImpl
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
@ -111,8 +110,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
|
||||
# these tests seem to produce leftover memory
|
||||
gpu_memory_utilization=0.80,
|
||||
load_format="dummy",
|
||||
model_impl=ModelImpl.TRANSFORMERS
|
||||
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
|
||||
model_impl="transformers"
|
||||
if model_arch in _TRANSFORMERS_BACKEND_MODELS else "vllm",
|
||||
hf_overrides=hf_overrides_fn,
|
||||
max_num_seqs=model_info.max_num_seqs)
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ from vllm.platforms import current_platform
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
from ..utils import multi_gpu_test, prep_prompts
|
||||
from .utils import check_logprobs_close
|
||||
from .utils import check_embeddings_close, check_logprobs_close
|
||||
|
||||
|
||||
def check_implementation(
|
||||
@ -165,6 +165,40 @@ def test_embed_loading(vllm_runner, model):
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
# Encoder model
|
||||
"BAAI/bge-base-en-v1.5",
|
||||
])
|
||||
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
|
||||
import transformers
|
||||
from packaging.version import Version
|
||||
installed = Version(transformers.__version__)
|
||||
required = Version("4.57.0.dev0")
|
||||
if installed < required:
|
||||
pytest.skip("Encoder models with the Transformers backend require "
|
||||
f"transformers>={required}, but got {installed}")
|
||||
|
||||
with vllm_runner(model, max_model_len=512,
|
||||
model_impl="transformers") as vllm_model:
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
assert model_config.using_transformers_backend()
|
||||
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(model, is_sentence_transformer=True) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
|
||||
@ -22,7 +22,7 @@ class DataModuleConfig(TypedDict):
|
||||
|
||||
class ImagePrompt(BaseModel):
|
||||
|
||||
data_format: Literal["b64_json", "bytes", "url"]
|
||||
data_format: Literal["b64_json", "bytes", "url", "path"]
|
||||
"""
|
||||
This is the data type for the input image
|
||||
"""
|
||||
|
||||
@ -1,182 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
from ..conftest import VllmRunner
|
||||
|
||||
MODELS = ["distilbert/distilgpt2"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
This module is V0 only since it uses dtype=float, so
|
||||
set VLLM_USE_V1=0 for all tests in the module.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype",
|
||||
["float"]) # needed for comparing logprobs with HF
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
|
||||
@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size
|
||||
@pytest.mark.parametrize("detokenize", [True, False])
|
||||
def test_get_prompt_logprobs(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype,
|
||||
chunked_prefill_token_size: int,
|
||||
num_top_logprobs: int,
|
||||
detokenize: bool,
|
||||
example_prompts,
|
||||
):
|
||||
max_num_seqs = 256
|
||||
enable_chunked_prefill = False
|
||||
max_num_batched_tokens = None
|
||||
if chunked_prefill_token_size != -1:
|
||||
enable_chunked_prefill = True
|
||||
max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
max_tokens = 5
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_logprobs = hf_model.generate_greedy_logprobs(
|
||||
example_prompts,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_logprobs=num_top_logprobs,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_seqs,
|
||||
) as vllm_model:
|
||||
vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
|
||||
logprobs=num_top_logprobs,
|
||||
prompt_logprobs=num_top_logprobs,
|
||||
temperature=0.0,
|
||||
detokenize=detokenize)
|
||||
vllm_results = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=vllm_sampling_params)
|
||||
|
||||
# Test whether logprobs are included in the results.
|
||||
for result in vllm_results:
|
||||
assert result.prompt_logprobs is not None
|
||||
assert result.outputs[0].logprobs is not None
|
||||
assert len(result.outputs[0].logprobs) == max_tokens
|
||||
for logprobs in result.outputs[0].logprobs:
|
||||
# If the output token is not included in the top X
|
||||
# logprob, it can return 1 more data
|
||||
assert (len(logprobs) == num_top_logprobs
|
||||
or len(logprobs) == num_top_logprobs + 1)
|
||||
output_text = result.outputs[0].text
|
||||
output_string_from_most_likely_tokens_lst: list[str] = []
|
||||
for top_logprobs in result.outputs[0].logprobs:
|
||||
top_logprob = next(iter(top_logprobs.values()))
|
||||
output_string_from_most_likely_tokens_lst.append(
|
||||
top_logprob.decoded_token)
|
||||
|
||||
if detokenize:
|
||||
output_string_from_most_likely_tokens = "".join(
|
||||
output_string_from_most_likely_tokens_lst)
|
||||
assert output_text == output_string_from_most_likely_tokens, (
|
||||
"The output text from the top logprob for each token position "
|
||||
"should be the same as the output text in the result.")
|
||||
else:
|
||||
assert output_text == ''
|
||||
assert output_string_from_most_likely_tokens_lst == ([None] *
|
||||
max_tokens)
|
||||
|
||||
# The first prompt logprob is always None
|
||||
assert result.prompt_logprobs[0] is None
|
||||
for prompt_logprobs in result.prompt_logprobs[1:]:
|
||||
# If the prompt token is not included in the top X
|
||||
# logprob, it can return 1 more data
|
||||
assert (len(prompt_logprobs) == num_top_logprobs
|
||||
or len(prompt_logprobs) == num_top_logprobs + 1)
|
||||
|
||||
# Test whether prompt logprobs are consistent with HF
|
||||
for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
|
||||
# Check prompt logprobs
|
||||
# The first prompt logprob is always None, so we compare it from 1:.
|
||||
vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
|
||||
for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
|
||||
for token_id, logprob in vllm_prompt_logprob_dict.items():
|
||||
torch.testing.assert_close(logprob.logprob,
|
||||
hf_logprob[0][i][token_id].item(),
|
||||
atol=1e-2,
|
||||
rtol=1e-2)
|
||||
vllm_sample_logprobs = vllm_result.outputs[0].logprobs
|
||||
for i, top_logprobs in enumerate(vllm_sample_logprobs):
|
||||
for token_id, sample_logprob in top_logprobs.items():
|
||||
logprob = sample_logprob.logprob
|
||||
torch.testing.assert_close(logprob,
|
||||
hf_logprob[i][-1][token_id].item(),
|
||||
atol=1e-2,
|
||||
rtol=1e-2)
|
||||
if detokenize:
|
||||
assert isinstance(sample_logprob.decoded_token, str), (
|
||||
"The token should be decoded by the time it is returned"
|
||||
" to the user.")
|
||||
|
||||
# Test if prompt logprobs are correctly set.
|
||||
for vllm_result in vllm_results:
|
||||
token_ids = vllm_result.prompt_token_ids
|
||||
prompt_logprobs = vllm_result.prompt_logprobs
|
||||
|
||||
# The first token doesn't have logprob.
|
||||
assert prompt_logprobs[0] is None
|
||||
|
||||
for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]):
|
||||
assert token_id in logprob_dict
|
||||
|
||||
|
||||
def test_max_logprobs():
|
||||
runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
|
||||
vllm_sampling_params = SamplingParams(logprobs=1)
|
||||
# should pass
|
||||
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
|
||||
|
||||
bad_sampling_params = SamplingParams(logprobs=2)
|
||||
with pytest.raises(ValueError):
|
||||
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
|
||||
@pytest.mark.parametrize("detokenize", [True, False])
|
||||
def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
|
||||
detokenize: bool, example_prompts):
|
||||
max_num_seqs = 256
|
||||
enable_chunked_prefill = False
|
||||
max_num_batched_tokens = None
|
||||
if chunked_prefill_token_size != -1:
|
||||
enable_chunked_prefill = True
|
||||
max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
max_tokens = 5
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
max_num_seqs=max_num_seqs,
|
||||
) as vllm_model:
|
||||
sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens,
|
||||
logprobs=None,
|
||||
temperature=0.0,
|
||||
detokenize=detokenize)
|
||||
results_logprobs_none = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=sampling_params_logprobs_none)
|
||||
|
||||
for i in range(len(results_logprobs_none)):
|
||||
assert results_logprobs_none[i].outputs[0].logprobs is None
|
||||
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
|
||||
@ -207,25 +207,19 @@ def test_get_pooling_config():
|
||||
model_id = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
pooling_config = model_config._init_pooler_config()
|
||||
assert pooling_config is not None
|
||||
|
||||
assert pooling_config.normalize
|
||||
assert pooling_config.pooling_type == PoolingType.MEAN.name
|
||||
assert model_config.pooler_config is not None
|
||||
assert model_config.pooler_config.normalize
|
||||
assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_rocm(),
|
||||
reason="Xformers backend is not supported on ROCm.")
|
||||
def test_get_pooling_config_from_args():
|
||||
model_id = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
model_config = ModelConfig(model_id)
|
||||
pooler_config = PoolerConfig(pooling_type="CLS", normalize=True)
|
||||
model_config = ModelConfig(model_id, pooler_config=pooler_config)
|
||||
|
||||
override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
|
||||
model_config.override_pooler_config = override_pooler_config
|
||||
|
||||
pooling_config = model_config._init_pooler_config()
|
||||
assert pooling_config is not None
|
||||
assert asdict(pooling_config) == asdict(override_pooler_config)
|
||||
assert asdict(model_config.pooler_config) == asdict(pooler_config)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -1,84 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for the SamplingParams class.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen1.5-7B"
|
||||
|
||||
|
||||
def test_max_tokens_none():
|
||||
"""max_tokens=None should be allowed"""
|
||||
SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def model_config():
|
||||
return ModelConfig(
|
||||
MODEL_NAME,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_max_tokens():
|
||||
return 4096
|
||||
|
||||
|
||||
def test_sampling_params_from_request_with_no_guided_decoding_backend(
|
||||
model_config, default_max_tokens):
|
||||
# guided_decoding_backend is not present at request level
|
||||
request = ChatCompletionRequest.model_validate({
|
||||
'messages': [{
|
||||
'role': 'user',
|
||||
'content': 'Hello'
|
||||
}],
|
||||
'model':
|
||||
MODEL_NAME,
|
||||
'response_format': {
|
||||
'type': 'json_object',
|
||||
},
|
||||
})
|
||||
|
||||
sampling_params = request.to_sampling_params(
|
||||
default_max_tokens,
|
||||
model_config.logits_processor_pattern,
|
||||
)
|
||||
# we do not expect any backend to be present and the default
|
||||
# guided_decoding_backend at engine level will be used.
|
||||
assert sampling_params.guided_decoding.backend is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
|
||||
[("xgrammar", "xgrammar"), ("guidance", "guidance"),
|
||||
("outlines", "outlines")])
|
||||
def test_sampling_params_from_request_with_guided_decoding_backend(
|
||||
request_level_guided_decoding_backend: str, expected: str,
|
||||
model_config, default_max_tokens):
|
||||
|
||||
request = ChatCompletionRequest.model_validate({
|
||||
'messages': [{
|
||||
'role': 'user',
|
||||
'content': 'Hello'
|
||||
}],
|
||||
'model':
|
||||
MODEL_NAME,
|
||||
'response_format': {
|
||||
'type': 'json_object',
|
||||
},
|
||||
'guided_decoding_backend':
|
||||
request_level_guided_decoding_backend,
|
||||
})
|
||||
|
||||
sampling_params = request.to_sampling_params(
|
||||
default_max_tokens,
|
||||
model_config.logits_processor_pattern,
|
||||
)
|
||||
# backend correctly identified in resulting sampling_params
|
||||
assert sampling_params.guided_decoding.backend == expected
|
||||
@ -68,7 +68,7 @@ EXAMPLE_TOOLS = [
|
||||
def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
|
||||
should_match: bool):
|
||||
self = MagicMock(tool_choice="required", tools=tools)
|
||||
schema = ChatCompletionRequest._get_guided_json_from_tool(self)
|
||||
schema = ChatCompletionRequest._get_json_schema_from_tool(self)
|
||||
assert isinstance(schema, dict)
|
||||
|
||||
# use build_regex_from_schema used in JSONLogitsProcessor to create Guide
|
||||
@ -218,7 +218,7 @@ VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
|
||||
}
|
||||
}, {}], False),
|
||||
])
|
||||
def test_guided_json(sample_output, should_match):
|
||||
def test_structured_outputs_json(sample_output, should_match):
|
||||
_compile_and_check(tools=TypeAdapter(
|
||||
list[ChatCompletionToolsParam]).validate_python(EXAMPLE_TOOLS),
|
||||
sample_output=sample_output,
|
||||
@ -273,8 +273,9 @@ def update_parameters_empty_dict(
|
||||
@pytest.mark.parametrize(
|
||||
"update_parameters",
|
||||
[update_parameters_none, update_parameters_empty_dict])
|
||||
def test_guided_json_without_parameters(sample_output, should_match,
|
||||
update_parameters):
|
||||
def test_structured_outputs_json_without_parameters(sample_output,
|
||||
should_match,
|
||||
update_parameters):
|
||||
updated_tools = [deepcopy(EXAMPLE_TOOLS[0])]
|
||||
tools = TypeAdapter(
|
||||
list[ChatCompletionToolsParam]).validate_python(updated_tools)
|
||||
@ -334,4 +335,4 @@ def test_streaming_output_valid(output, empty_params, delta_len):
|
||||
combined_messages += message.tool_calls[0].function.arguments
|
||||
combined_messages += "}]"
|
||||
assert json.loads(combined_messages) == output
|
||||
assert json.dumps(json.loads(combined_messages)) == output_json
|
||||
assert json.dumps(json.loads(combined_messages)) == output_json
|
||||
|
||||
@ -513,27 +513,27 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn):
|
||||
assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None))
|
||||
|
||||
|
||||
def _stats(requests: int, queries: int, hits: int) -> PrefixCacheStats:
|
||||
return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
|
||||
|
||||
|
||||
def test_metrics():
|
||||
"""
|
||||
Test the prefix caching metrics.
|
||||
"""
|
||||
|
||||
def stats(requests, queries, hits):
|
||||
return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
|
||||
|
||||
metrics = PrefixCachingMetrics(max_recent_requests=5)
|
||||
assert metrics.hit_rate == 0.0
|
||||
|
||||
metrics.observe(stats(1, 20, 9))
|
||||
metrics.observe(_stats(1, 20, 9))
|
||||
# 9 / 20 = 0.45
|
||||
assert metrics.hit_rate == 0.45
|
||||
|
||||
metrics.observe(stats(4, 80, 16))
|
||||
metrics.observe(_stats(4, 80, 16))
|
||||
|
||||
# 25 / 100 = 0.25
|
||||
assert metrics.hit_rate == 0.25
|
||||
|
||||
metrics.observe(stats(1, 10, 2))
|
||||
metrics.observe(_stats(1, 10, 2))
|
||||
|
||||
# Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
|
||||
assert metrics.aggregated_requests == 5
|
||||
@ -549,6 +549,38 @@ def test_metrics():
|
||||
assert not metrics.query_queue
|
||||
|
||||
|
||||
def test_metrics_empty_stats():
|
||||
"""
|
||||
Test the prefix caching metrics with empty stats.
|
||||
"""
|
||||
metrics = PrefixCachingMetrics(max_recent_requests=5)
|
||||
metrics.observe(_stats(0, 0, 0))
|
||||
metrics.observe(_stats(1, 20, 9))
|
||||
metrics.observe(_stats(0, 0, 0))
|
||||
metrics.observe(_stats(4, 80, 16))
|
||||
metrics.observe(_stats(0, 0, 0))
|
||||
metrics.observe(_stats(1, 10, 2))
|
||||
# Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
|
||||
assert metrics.aggregated_requests == 5
|
||||
assert metrics.aggregated_query_total == 90
|
||||
assert metrics.aggregated_query_hit == 18
|
||||
assert metrics.hit_rate == 0.2
|
||||
|
||||
# Only the latest added stats preserved 10 / 20 = 0.5
|
||||
metrics.observe(_stats(11, 20, 10))
|
||||
assert metrics.aggregated_requests == 11
|
||||
assert metrics.aggregated_query_total == 20
|
||||
assert metrics.aggregated_query_hit == 10
|
||||
assert metrics.hit_rate == 0.5
|
||||
|
||||
# Only the latest added stats preserved 30 / 40 = 0.75
|
||||
metrics.observe(_stats(22, 40, 30))
|
||||
assert metrics.aggregated_requests == 22
|
||||
assert metrics.aggregated_query_total == 40
|
||||
assert metrics.aggregated_query_hit == 30
|
||||
assert metrics.hit_rate == 0.75
|
||||
|
||||
|
||||
def test_get_kv_cache_configs_multiple_workers():
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
@ -10,7 +10,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
@ -1796,11 +1796,11 @@ def test_schedule_skip_tokenizer_init():
|
||||
|
||||
def test_schedule_skip_tokenizer_init_structured_output_request():
|
||||
scheduler = create_scheduler(skip_tokenizer_init=True)
|
||||
guided_params = GuidedDecodingParams(regex="[0-9]+")
|
||||
structured_outputs_params = StructuredOutputsParams(regex="[0-9]+")
|
||||
sampling_params = SamplingParams(
|
||||
ignore_eos=False,
|
||||
max_tokens=16,
|
||||
guided_decoding=guided_params,
|
||||
structured_outputs=structured_outputs_params,
|
||||
)
|
||||
request = Request(
|
||||
request_id="0",
|
||||
|
||||
@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Optional
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -97,7 +97,7 @@ def _get_test_sampling_params(
|
||||
top_p=0.95,
|
||||
n=n,
|
||||
seed=seed,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
regex="[0-9]+") if structured_outputs else None,
|
||||
) for n in n_list
|
||||
], n_list
|
||||
|
||||
@ -151,7 +151,7 @@ def sample_definition_json_schema():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_guided_choice():
|
||||
def sample_structured_outputs_choices():
|
||||
return [
|
||||
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
|
||||
"Ruby", "Swift", "Kotlin"
|
||||
|
||||
@ -15,12 +15,13 @@ import torch
|
||||
from pydantic import BaseModel
|
||||
|
||||
from tests.reasoning.utils import run_reasoning_extraction
|
||||
from vllm.config import StructuredOutputsConfig
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import TokenizerMode
|
||||
@ -90,7 +91,7 @@ def _load_json(s: str, backend: str) -> str:
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, guided_decoding_backend, tokenizer_mode, speculative_config",
|
||||
"model_name, backend, tokenizer_mode, speculative_config",
|
||||
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
|
||||
def test_structured_output(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@ -99,8 +100,8 @@ def test_structured_output(
|
||||
sample_sql_ebnf: str,
|
||||
sample_sql_lark: str,
|
||||
sample_regex: str,
|
||||
sample_guided_choice: str,
|
||||
guided_decoding_backend: str,
|
||||
sample_structured_outputs_choices: str,
|
||||
backend: str,
|
||||
tokenizer_mode: str,
|
||||
model_name: str,
|
||||
speculative_config: dict[str, Any],
|
||||
@ -115,16 +116,15 @@ def test_structured_output(
|
||||
enforce_eager = bool(not current_platform.is_tpu())
|
||||
# Use a single LLM instance for several scenarios to
|
||||
# speed up the test suite.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
guided_decoding_disable_any_whitespace=(guided_decoding_backend
|
||||
in {"xgrammar", "guidance"}),
|
||||
seed=120,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
speculative_config=speculative_config)
|
||||
llm = LLM(model=model_name,
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=1024,
|
||||
structured_outputs_config=dict(backend=backend,
|
||||
disable_any_whitespace=backend
|
||||
in {"xgrammar", "guidance"}),
|
||||
seed=120,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
speculative_config=speculative_config)
|
||||
|
||||
#
|
||||
# Test 1: Generate JSON output based on a provided schema
|
||||
@ -132,7 +132,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
structured_outputs=StructuredOutputsParams(json=sample_json_schema))
|
||||
|
||||
prompt = ("Give an example JSON for an employee profile that fits this "
|
||||
"schema. Make the response as short as possible. Schema: "
|
||||
@ -152,7 +152,7 @@ def test_structured_output(
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
if guided_decoding_backend != 'lm-format-enforcer':
|
||||
if backend != 'lm-format-enforcer':
|
||||
assert "\n" not in generated_text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
output_json = json.loads(generated_text)
|
||||
@ -161,12 +161,12 @@ def test_structured_output(
|
||||
#
|
||||
# Test 2: Generate JSON object without a schema
|
||||
#
|
||||
if guided_decoding_backend != "outlines":
|
||||
if backend != "outlines":
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
n=2,
|
||||
guided_decoding=GuidedDecodingParams(json_object=True))
|
||||
structured_outputs=StructuredOutputsParams(json_object=True))
|
||||
|
||||
outputs = llm.generate(prompts=(
|
||||
"Generate a JSON object with curly braces for a person with "
|
||||
@ -195,8 +195,9 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
|
||||
if guided_decoding_backend.startswith("xgrammar"):
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=unsupported_json_schema))
|
||||
if backend.startswith("xgrammar"):
|
||||
with pytest.raises(ValueError,
|
||||
match="The provided JSON schema contains features "
|
||||
"not supported by xgrammar."):
|
||||
@ -230,7 +231,7 @@ def test_structured_output(
|
||||
parsed_json = json.loads(generated_text)
|
||||
assert isinstance(parsed_json, dict)
|
||||
|
||||
if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
|
||||
if backend not in ["outlines", "lm-format-enforcer"]:
|
||||
#
|
||||
# Test 4: Generate SQL statement using EBNF grammar
|
||||
#
|
||||
@ -238,7 +239,8 @@ def test_structured_output(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
grammar=sample_sql_ebnf))
|
||||
outputs = llm.generate(
|
||||
("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1. Make the response as short as "
|
||||
@ -271,7 +273,8 @@ def test_structured_output(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
grammar=sample_sql_lark))
|
||||
outputs = llm.generate(
|
||||
("Generate a sql statement that selects col_1 from "
|
||||
"table_1 where it is equal to 1. Make the response as short as "
|
||||
@ -309,7 +312,8 @@ def test_structured_output(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
grammar="not a grammar"))
|
||||
with pytest.raises(ValueError, match="Failed to convert the grammar "):
|
||||
llm.generate(
|
||||
("Generate a sql statement that selects col_1 from "
|
||||
@ -325,7 +329,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
structured_outputs=StructuredOutputsParams(regex=sample_regex))
|
||||
|
||||
prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. "
|
||||
f"Make the response as short as possible.")
|
||||
@ -352,7 +356,8 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
choice=sample_structured_outputs_choices))
|
||||
|
||||
outputs = llm.generate(
|
||||
("The best language for type-safe systems programming is "
|
||||
@ -368,7 +373,7 @@ def test_structured_output(
|
||||
generated_text = output.outputs[0].text
|
||||
print(generated_text)
|
||||
assert generated_text is not None
|
||||
assert generated_text in sample_guided_choice
|
||||
assert generated_text in sample_structured_outputs_choices
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
#
|
||||
@ -378,7 +383,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=json_schema))
|
||||
structured_outputs=StructuredOutputsParams(json=json_schema))
|
||||
|
||||
outputs = llm.generate(
|
||||
("Generate a JSON with the brand, model and car_type of the most "
|
||||
@ -422,7 +427,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(json=json_schema))
|
||||
structured_outputs=StructuredOutputsParams(json=json_schema))
|
||||
|
||||
outputs = llm.generate(
|
||||
("Generate a description of a frog using 50 characters. "
|
||||
@ -444,7 +449,7 @@ def test_structured_output(
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json, schema=json_schema)
|
||||
|
||||
if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
|
||||
if backend not in ["outlines", "lm-format-enforcer"]:
|
||||
#
|
||||
# Test 11: Generate structured output using structural_tag format
|
||||
#
|
||||
@ -470,7 +475,7 @@ def test_structured_output(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=4096,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
structural_tag=json.dumps(structural_tag_config)))
|
||||
|
||||
prompt = """
|
||||
@ -547,7 +552,7 @@ Make the response as short as possible.
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501
|
||||
"model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501
|
||||
[
|
||||
("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
|
||||
"deepseek_r1", NGRAM_SPEC_CONFIG),
|
||||
@ -556,7 +561,7 @@ Make the response as short as possible.
|
||||
)
|
||||
def test_structured_output_with_reasoning_matrices(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
guided_decoding_backend: str,
|
||||
backend: str,
|
||||
tokenizer_mode: TokenizerMode,
|
||||
reasoning_parser: str,
|
||||
model_name: str,
|
||||
@ -576,10 +581,11 @@ def test_structured_output_with_reasoning_matrices(
|
||||
enforce_eager=bool(not current_platform.is_tpu()),
|
||||
max_model_len=1024,
|
||||
max_num_seqs=16,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
guided_decoding_disable_any_whitespace=True,
|
||||
structured_outputs_config=dict(backend=backend,
|
||||
disable_any_whitespace=backend
|
||||
in {"xgrammar", "guidance"},
|
||||
reasoning_parser=reasoning_parser),
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
reasoning_parser=reasoning_parser,
|
||||
speculative_config=speculative_config,
|
||||
)
|
||||
tokenizer = llm.get_tokenizer()
|
||||
@ -603,7 +609,7 @@ def test_structured_output_with_reasoning_matrices(
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.1,
|
||||
max_tokens=8192,
|
||||
guided_decoding=GuidedDecodingParams(json=reasoning_schema),
|
||||
structured_outputs=StructuredOutputsParams(json=reasoning_schema),
|
||||
)
|
||||
outputs = llm.generate(
|
||||
[reasoning_prompt],
|
||||
@ -640,13 +646,14 @@ def test_structured_output_auto_mode(
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend="auto",
|
||||
structured_outputs_config=dict(backend="auto"),
|
||||
tokenizer_mode=tokenizer_mode)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=unsupported_json_schema))
|
||||
|
||||
prompts = (
|
||||
"Give an example JSON object for a grade "
|
||||
@ -681,9 +688,10 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend="guidance",
|
||||
guided_decoding_disable_any_whitespace=True,
|
||||
guided_decoding_disable_additional_properties=True)
|
||||
structured_outputs_config=dict(
|
||||
backend="guidance",
|
||||
disable_any_whitespace=True,
|
||||
disable_additional_properties=True))
|
||||
|
||||
schema = {
|
||||
'type': 'object',
|
||||
@ -709,14 +717,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
"<|im_end|>\n<|im_start|>assistant\n")
|
||||
|
||||
def generate_with_backend(backend):
|
||||
guided_params = GuidedDecodingParams(
|
||||
structured_outputs_params = StructuredOutputsParams(
|
||||
json=schema,
|
||||
backend=backend,
|
||||
disable_any_whitespace=True,
|
||||
disable_additional_properties=True)
|
||||
sampling_params = SamplingParams(temperature=0,
|
||||
max_tokens=256,
|
||||
guided_decoding=guided_params)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=256,
|
||||
structured_outputs=structured_outputs_params)
|
||||
|
||||
outputs = llm.generate(prompt, sampling_params=sampling_params)
|
||||
assert outputs is not None
|
||||
@ -736,12 +745,11 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
assert "a6" not in generated
|
||||
|
||||
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
["guidance", "xgrammar", "outlines"])
|
||||
def test_structured_output_batched_with_non_guided_requests(
|
||||
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
|
||||
def test_structured_output_batched_with_non_structured_outputs_requests(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
sample_json_schema: dict[str, Any],
|
||||
guided_decoding_backend: str,
|
||||
backend: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
@ -753,24 +761,25 @@ def test_structured_output_batched_with_non_guided_requests(
|
||||
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
enforce_eager=enforce_eager,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
guided_decoding_disable_any_whitespace=(guided_decoding_backend
|
||||
in {"xgrammar", "guidance"}),
|
||||
structured_outputs_config=StructuredOutputsConfig(
|
||||
backend=backend,
|
||||
disable_any_whitespace=backend in {"xgrammar", "guidance"},
|
||||
),
|
||||
)
|
||||
|
||||
guided_prompt = (
|
||||
structured_outputs_prompt = (
|
||||
"Give an example JSON for an employee profile that fits this "
|
||||
"schema. Make the response as short as possible. Schema: "
|
||||
f"{sample_json_schema}")
|
||||
|
||||
non_guided_prompt = "The diameter of the Earth in kilometers is "
|
||||
non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "
|
||||
|
||||
prompts = [guided_prompt, non_guided_prompt]
|
||||
prompts = [structured_outputs_prompt, non_structured_outputs_prompt]
|
||||
sampling_params = [
|
||||
SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=400,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema)),
|
||||
SamplingParams(temperature=1.0,
|
||||
max_tokens=400,
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=sample_json_schema)),
|
||||
# No max tokens, temp=0 to assert on contents
|
||||
SamplingParams(
|
||||
seed=42,
|
||||
@ -801,16 +810,16 @@ def test_structured_output_batched_with_non_guided_requests(
|
||||
print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")
|
||||
|
||||
if index == 0:
|
||||
# First prompt is guided, expect valid JSON
|
||||
# First prompt is structured outputs, expect valid JSON
|
||||
assert "\n" not in generated_text
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json,
|
||||
schema=sample_json_schema)
|
||||
else:
|
||||
# Second prompt is not guided, expect valid output
|
||||
# Second prompt is not structured outputs, expect valid output
|
||||
# Cannot assert on exact output, but we can expect it to be factual
|
||||
assert "12,742" in generated_text
|
||||
|
||||
# non-guided requests should not return a valid JSON here
|
||||
# non-structured outputs requests should not return a valid JSON here
|
||||
with pytest.raises(ValueError):
|
||||
output_json = json.loads(generated_text)
|
||||
|
||||
@ -77,7 +77,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_json": invalid_json_schema},
|
||||
extra_body={"structured_outputs": {
|
||||
"json": invalid_json_schema
|
||||
}},
|
||||
)
|
||||
|
||||
|
||||
@ -99,7 +101,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": r"[.*",
|
||||
"structured_outputs": {
|
||||
"regex": r"[.*"
|
||||
},
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
@ -134,5 +138,9 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_grammar": invalid_simplified_sql_grammar},
|
||||
extra_body={
|
||||
"structured_outputs": {
|
||||
"grammar": invalid_simplified_sql_grammar
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
@ -627,7 +627,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
extra_body={"guided_json": invalid_json_schema},
|
||||
extra_body={"structured_outputs": {
|
||||
"json": invalid_json_schema
|
||||
}},
|
||||
)
|
||||
|
||||
|
||||
@ -646,7 +648,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
extra_body={
|
||||
"guided_regex": r"[.*",
|
||||
"structured_outputs": {
|
||||
"regex": r"[.*"
|
||||
},
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
@ -678,7 +682,11 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
|
||||
await client.completions.create(
|
||||
model=model_name,
|
||||
prompt=prompt,
|
||||
extra_body={"guided_grammar": invalid_simplified_sql_grammar},
|
||||
extra_body={
|
||||
"structured_outputs": {
|
||||
"grammar": invalid_simplified_sql_grammar
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -18,12 +18,18 @@ import torch
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import KVTransferConfig
|
||||
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
||||
KVConnectorStats)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
|
||||
MultiKVConnectorStats)
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
|
||||
KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
|
||||
NixlConnectorWorker)
|
||||
NixlConnectorWorker, NixlKVConnectorStats)
|
||||
from vllm.forward_context import ForwardContext
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
|
||||
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
|
||||
|
||||
from .utils import create_request, create_scheduler, create_vllm_config
|
||||
|
||||
@ -475,6 +481,209 @@ class TestNixlHandshake:
|
||||
# NOTE: resource cleanup in mp backend is a bit finicky, so the order in which
|
||||
# we put here is important. First run ray, it will clean up the resources, then
|
||||
# the rest of the tests.
|
||||
@patch(
|
||||
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
|
||||
FakeNixlWrapper)
|
||||
def test_kv_connector_stats(dist_init):
|
||||
"""Test that KV transfer stats are properly recorded and retrieved."""
|
||||
vllm_config = create_vllm_config()
|
||||
|
||||
# Test worker role in decode server.
|
||||
connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
|
||||
connector.connector_worker = FakeNixlConnectorWorker(vllm_config,
|
||||
connector.engine_id,
|
||||
hand_shake_latency=0)
|
||||
|
||||
# Verify that xfer_stats starts empty
|
||||
initial_stats = connector.get_kv_connector_stats()
|
||||
assert initial_stats is None
|
||||
|
||||
# Create transfer metadata
|
||||
request_id = "test_req_for_stats"
|
||||
metadata = NixlConnectorMetadata()
|
||||
metadata.add_new_req(request_id=request_id,
|
||||
local_block_ids=[1, 2, 3],
|
||||
kv_transfer_params={
|
||||
"remote_block_ids": [4, 5, 6],
|
||||
"remote_engine_id":
|
||||
FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
|
||||
"remote_host": "localhost",
|
||||
"remote_port": 1234,
|
||||
"remote_tp_size": 1,
|
||||
})
|
||||
connector.bind_connector_metadata(metadata)
|
||||
|
||||
# Start the transfer
|
||||
dummy_ctx = ForwardContext(
|
||||
no_compile_layers={},
|
||||
attn_metadata={},
|
||||
virtual_engine=0,
|
||||
)
|
||||
connector.start_load_kv(dummy_ctx)
|
||||
|
||||
# Verify stats are recorded after transfer is complete
|
||||
max_iterations = 2
|
||||
# Clear metadata before start_load_kv to prevent reprocessing same request
|
||||
connector.bind_connector_metadata(NixlConnectorMetadata())
|
||||
for _ in range(max_iterations):
|
||||
# Need to call start_load_kv to process completed handshakes
|
||||
connector.start_load_kv(dummy_ctx)
|
||||
_, done_recving = connector.get_finished(finished_req_ids=set())
|
||||
if len(done_recving) > 0 and request_id in done_recving:
|
||||
break
|
||||
time.sleep(
|
||||
0.1) # Small delay to allow background handshake to complete
|
||||
else:
|
||||
assert "Transfer did not complete within expected iterations"
|
||||
|
||||
# Now check that stats were recorded
|
||||
stats_after_transfer = connector.get_kv_connector_stats()
|
||||
assert isinstance(stats_after_transfer, NixlKVConnectorStats)
|
||||
|
||||
# Verify stats values are recorded
|
||||
assert not stats_after_transfer.is_empty()
|
||||
assert stats_after_transfer.data["num_successful_transfers"] == 1
|
||||
|
||||
# Verify stats are reset after retrieval
|
||||
stats_after_reset = connector.get_kv_connector_stats()
|
||||
assert stats_after_reset is None
|
||||
|
||||
|
||||
def test_kv_connector_stats_aggregation():
|
||||
"""
|
||||
Test KV transfer stats aggregation across TP ranks using
|
||||
KVOutputAggregator (used by MultiprocExecutor).
|
||||
"""
|
||||
|
||||
# Create KVOutputAggregator for 3 workers (simulating TP=3), same thing
|
||||
# done in MultiprocExecutor.execute_model
|
||||
aggregator = KVOutputAggregator(world_size=3)
|
||||
|
||||
# Create stats for multiple workers with different transfer patterns
|
||||
worker1_stats = NixlKVConnectorStats()
|
||||
worker2_stats = NixlKVConnectorStats()
|
||||
worker3_stats = NixlKVConnectorStats()
|
||||
|
||||
# Record different transfers on each worker
|
||||
# Worker 1: 2 transfers
|
||||
worker1_stats.record_transfer()
|
||||
worker1_stats.record_transfer()
|
||||
|
||||
# Worker 2: 1 transfer
|
||||
worker2_stats.record_transfer()
|
||||
|
||||
# Worker 3: 3 transfers
|
||||
worker3_stats.record_transfer()
|
||||
worker3_stats.record_transfer()
|
||||
worker3_stats.record_transfer()
|
||||
|
||||
# Create ModelRunnerOutput instances for each worker
|
||||
worker_outputs = []
|
||||
for i, worker_stats in enumerate(
|
||||
[worker1_stats, worker2_stats, worker3_stats]):
|
||||
output = ModelRunnerOutput(
|
||||
req_ids=[f"req_{i}"],
|
||||
req_id_to_index={f"req_{i}": 0},
|
||||
sampled_token_ids=[[123]], # dummy token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[None],
|
||||
kv_connector_output=KVConnectorOutput(
|
||||
finished_sending=set([f"req_{i}_send"])
|
||||
if i < 2 else None, # Workers 0,1 finished sending
|
||||
finished_recving=set([f"req_{i}_recv"])
|
||||
if i > 0 else None, # Workers 1,2 finished receiving
|
||||
kv_connector_stats=worker_stats,
|
||||
))
|
||||
worker_outputs.append(output)
|
||||
|
||||
# Use the real aggregation mechanism (like MultiprocExecutor.execute_model)
|
||||
aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
|
||||
kv_connector_stats = \
|
||||
aggregated_output.kv_connector_output.kv_connector_stats
|
||||
assert isinstance(kv_connector_stats, NixlKVConnectorStats)
|
||||
# Number of total transfers across all workers.
|
||||
assert kv_connector_stats.data["num_successful_transfers"] == 6
|
||||
|
||||
|
||||
def test_multi_kv_connector_stats_aggregation():
|
||||
"""
|
||||
Test MultiKVConnectorStats aggregation across TP ranks using
|
||||
KVOutputAggregator (used by MultiprocExecutor).
|
||||
"""
|
||||
|
||||
aggregator = KVOutputAggregator(world_size=3)
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class FooKVConnectorStats(KVConnectorStats):
|
||||
|
||||
def reset(self):
|
||||
self.data = {"num_foo_transfers": 0}
|
||||
|
||||
def record_transfer(self):
|
||||
if "num_foo_transfers" not in self.data:
|
||||
self.data["num_foo_transfers"] = 0
|
||||
self.data["num_foo_transfers"] += 1
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
return self.data["num_foo_transfers"] == 0
|
||||
|
||||
def aggregate(self,
|
||||
other: "FooKVConnectorStats") -> "FooKVConnectorStats":
|
||||
if not other.is_empty():
|
||||
self.data["num_foo_transfers"] += other.data[
|
||||
"num_foo_transfers"]
|
||||
return self
|
||||
|
||||
def make_multi_stats(nixl_count: int,
|
||||
foo_count: int) -> MultiKVConnectorStats:
|
||||
data: dict[str, KVConnectorStats] = {}
|
||||
if nixl_count > 0:
|
||||
nixl_stats = NixlKVConnectorStats()
|
||||
for _ in range(nixl_count):
|
||||
nixl_stats.record_transfer()
|
||||
data["NixlConnector"] = nixl_stats
|
||||
if foo_count > 0:
|
||||
foo_stats = FooKVConnectorStats()
|
||||
for _ in range(foo_count):
|
||||
foo_stats.record_transfer()
|
||||
data["FooConnector"] = foo_stats
|
||||
return MultiKVConnectorStats(data=data)
|
||||
|
||||
# Create heterogeneous stats across 3 workers
|
||||
worker_patterns = [(2, 1), (3, 0), (0, 5)] # (Nixl, Foo)
|
||||
|
||||
worker_outputs: list[ModelRunnerOutput] = []
|
||||
for i, (nixl, foo) in enumerate(worker_patterns):
|
||||
stats = make_multi_stats(nixl, foo)
|
||||
output = ModelRunnerOutput(
|
||||
req_ids=[f"req_{i}"],
|
||||
req_id_to_index={f"req_{i}": 0},
|
||||
sampled_token_ids=[[123]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[None],
|
||||
kv_connector_output=KVConnectorOutput(
|
||||
finished_sending=set([f"req_{i}_send"]) if i < 2 else None,
|
||||
finished_recving=set([f"req_{i}_recv"]) if i > 0 else None,
|
||||
kv_connector_stats=stats,
|
||||
),
|
||||
)
|
||||
worker_outputs.append(output)
|
||||
|
||||
aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
|
||||
kv_connector_stats = \
|
||||
aggregated_output.kv_connector_output.kv_connector_stats
|
||||
assert isinstance(kv_connector_stats, MultiKVConnectorStats)
|
||||
|
||||
# Validate per-connector totals across workers
|
||||
assert kv_connector_stats["NixlConnector"].data[
|
||||
"num_successful_transfers"] == 5
|
||||
assert kv_connector_stats["FooConnector"].data["num_foo_transfers"] == 6
|
||||
|
||||
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
|
||||
@patch(
|
||||
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user