Compare commits
144 Commits
revert-222
...
remove-reg
| Author | SHA1 | Date | |
|---|---|---|---|
| 36ccdcad2c | |||
| 1d20c34717 | |||
| b6af24fba7 | |||
| 0ca2393b47 | |||
| 31a500c86f | |||
| 4e8614e88b | |||
| c6cd5ca3d3 | |||
| df0e0f023e | |||
| b4b78d6317 | |||
| 12817a8ac7 | |||
| c9232d41f4 | |||
| 9bd9294f0e | |||
| da2705198f | |||
| 19b927e52d | |||
| 20d65aa755 | |||
| b159c0a67a | |||
| 6772bb0f7d | |||
| fceafaf582 | |||
| 6b794c756c | |||
| 98deac3879 | |||
| 653124bd46 | |||
| 0b1bdac6af | |||
| d94e3026de | |||
| 3f52738dce | |||
| a01e0018b5 | |||
| 9e7e5baaa8 | |||
| d16aa3dae4 | |||
| 6807af8f46 | |||
| 4c558cf62e | |||
| 77a6bf07ae | |||
| 4082338a25 | |||
| c6b928798e | |||
| b1361c7273 | |||
| 4f0f844b16 | |||
| c5830381af | |||
| d31f97cf57 | |||
| 71683ca6f6 | |||
| e18859298d | |||
| fde0b611a3 | |||
| d0a6301588 | |||
| 45c3936e94 | |||
| ba81acbdc1 | |||
| 53c730286c | |||
| 6534d2fc97 | |||
| 422f22e012 | |||
| 6bd8ebf026 | |||
| dab4f9f764 | |||
| c42fe0b63a | |||
| 5a4b4b3729 | |||
| e5d3d63c42 | |||
| 3d9d40efde | |||
| 67c153b88a | |||
| f7ad6a1eb3 | |||
| 80bb1e8afe | |||
| d030b01548 | |||
| 767e63b860 | |||
| 007dd90859 | |||
| b8a9d0e429 | |||
| 50f2aae1b4 | |||
| 46ae7f6666 | |||
| 1ece7f30ba | |||
| bc8372efc3 | |||
| 8d17fa633e | |||
| 9f909b8996 | |||
| 59f3b93636 | |||
| 78077d5417 | |||
| 6d729c43fb | |||
| 2f4657952b | |||
| 3a7e3bbdd2 | |||
| 4fbd8bb597 | |||
| ad344ef552 | |||
| bbaf9e9cb1 | |||
| 4678503476 | |||
| 93d0652433 | |||
| ea1292ad3e | |||
| dc5e4a653c | |||
| 839ab00349 | |||
| 9b94d6ec8f | |||
| 1891a265d3 | |||
| 95a935fc48 | |||
| 458e74eb90 | |||
| 65abe111a3 | |||
| 807d21b80d | |||
| c90fb03df5 | |||
| 84cf78acee | |||
| 16fb668b61 | |||
| f7dcce7a4a | |||
| 8e13d9fe6d | |||
| 3fa5b25845 | |||
| 14a5d903ab | |||
| 951b038298 | |||
| ebf7605b0d | |||
| bc1d02ac85 | |||
| 1e55dfa7e5 | |||
| 384a052971 | |||
| 39052dbca8 | |||
| 9c97a1c349 | |||
| f919d4cb8f | |||
| afa5b7ca0b | |||
| 1b99028069 | |||
| 5898b135ab | |||
| b799f4b9ea | |||
| 06da44f0cb | |||
| a554991748 | |||
| d1af8b7be9 | |||
| 68b254d673 | |||
| 8c50d62f5a | |||
| b4e2916721 | |||
| 65a7917be4 | |||
| b76753f0b5 | |||
| b81fe83b2c | |||
| 0757551c96 | |||
| 8290d15d2c | |||
| 049c245143 | |||
| 00976db0c3 | |||
| d411df0296 | |||
| 010e0e39ea | |||
| 326976291b | |||
| 7e8d685775 | |||
| c49848396d | |||
| 2a84fb422f | |||
| 534c45b962 | |||
| 3d7363e61c | |||
| 0c5254b82a | |||
| 61f67d8acd | |||
| 42172ad18f | |||
| fbd8595c5c | |||
| 5a16fa614c | |||
| 2d18256e47 | |||
| 56186474f6 | |||
| 1bf5e1f25b | |||
| a6022e6fbc | |||
| 2be07a0db1 | |||
| 0edc0cd52b | |||
| 7920e9b1c5 | |||
| b7c0942b65 | |||
| 9a0c5ded5a | |||
| 10a02535d4 | |||
| 65552b476b | |||
| 7ad7adb67f | |||
| 6ade99eafa | |||
| 3157aebb63 | |||
| 8a0ffd6285 | |||
| 23472ff51c |
@ -12,7 +12,6 @@
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
|
||||
@ -36,7 +36,6 @@
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
@ -90,7 +89,6 @@
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
@ -144,7 +142,6 @@
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
@ -195,7 +192,6 @@
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
@ -248,7 +244,6 @@
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
@ -301,7 +296,6 @@
|
||||
"vllm_server_parameters": {
|
||||
"disable_log_stats": "",
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"num_scheduler_steps": 10,
|
||||
"max_num_seqs": 512,
|
||||
"dtype": "bfloat16"
|
||||
},
|
||||
|
||||
@ -128,7 +128,7 @@ run_and_track_test() {
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 1 "test_struct_output_generate.py" \
|
||||
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 2 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 3 "test_lora.py" \
|
||||
@ -139,6 +139,8 @@ run_and_track_test 5 "test_spmd_model_weight_loading.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||
run_and_track_test 7 "test_tpu_int8.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
|
||||
@ -134,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \
|
||||
run_and_track_test 2 "test_basic.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
|
||||
@ -57,20 +57,20 @@ steps:
|
||||
- vllm/
|
||||
- tests/mq_llm_engine
|
||||
- tests/async_engine
|
||||
- tests/test_inputs
|
||||
- tests/test_inputs.py
|
||||
- tests/test_outputs.py
|
||||
- tests/multimodal
|
||||
- tests/test_utils
|
||||
- tests/utils_
|
||||
- tests/worker
|
||||
- tests/standalone_tests/lazy_imports.py
|
||||
commands:
|
||||
- python3 standalone_tests/lazy_imports.py
|
||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||
- pytest -v -s async_engine # AsyncLLMEngine
|
||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||
- pytest -v -s test_inputs.py
|
||||
- pytest -v -s test_outputs.py
|
||||
- pytest -v -s multimodal
|
||||
- pytest -v -s test_utils.py # Utils
|
||||
- pytest -v -s utils_ # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
|
||||
- label: Python-only Installation Test
|
||||
@ -227,16 +227,6 @@ steps:
|
||||
##### fast check tests #####
|
||||
##### 1 GPU test #####
|
||||
|
||||
- label: Regression Test # 5min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
commands:
|
||||
- pip install modelscope
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
@ -426,7 +416,6 @@ steps:
|
||||
|
||||
- label: Tensorizer Test # 11min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
- tests/tensorizer_loader
|
||||
@ -535,8 +524,6 @@ steps:
|
||||
- vllm/
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pip freeze | grep -E 'torch'
|
||||
- pytest -v -s models/language -m core_model
|
||||
|
||||
@ -547,8 +534,10 @@ steps:
|
||||
- vllm/
|
||||
- tests/models/language/generation
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
# Install fast path packages for testing against transformers
|
||||
# Note: also needed to run plamo2 model in vLLM
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
- pytest -v -s models/language/generation -m hybrid_model
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
||||
@ -773,27 +762,6 @@ steps:
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/sampler.py
|
||||
- vllm/sequence.py
|
||||
- vllm/worker/worker_base.py
|
||||
- vllm/worker/worker.py
|
||||
- vllm/worker/multi_step_worker.py
|
||||
- vllm/worker/model_runner_base.py
|
||||
- vllm/worker/model_runner.py
|
||||
- vllm/worker/multi_step_model_runner.py
|
||||
- vllm/engine
|
||||
- tests/multi_step
|
||||
commands:
|
||||
# this test is quite flaky
|
||||
# TODO: investigate and fix.
|
||||
# - pytest -v -s multi_step/test_correctness_async_llm.py
|
||||
- pytest -v -s multi_step/test_correctness_llm.py
|
||||
|
||||
- label: Pipeline Parallelism Test # 45min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
|
||||
11
.github/CODEOWNERS
vendored
11
.github/CODEOWNERS
vendored
@ -9,7 +9,7 @@
|
||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
|
||||
/vllm/multimodal @DarkLight1337 @ywang96
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
# so spam a lot of people
|
||||
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
|
||||
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
@ -34,16 +34,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
/tests/multi_step @alexm-redhat @comaniac
|
||||
/tests/multimodal @DarkLight1337 @ywang96
|
||||
/tests/prefix_caching @comaniac @KuntaiDu
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
|
||||
/tests/lora @jeejeelee
|
||||
|
||||
# Docs
|
||||
|
||||
20
.github/PULL_REQUEST_TEMPLATE.md
vendored
20
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -1,11 +1,5 @@
|
||||
# Essential Elements of an Effective PR Description Checklist
|
||||
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
|
||||
<!-- markdownlint-disable -->
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
|
||||
|
||||
## Purpose
|
||||
|
||||
@ -15,4 +9,14 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
|
||||
|
||||
## (Optional) Documentation Update
|
||||
|
||||
---
|
||||
<details>
|
||||
<summary> Essential Elements of an Effective PR Description Checklist </summary>
|
||||
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
</details>
|
||||
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||
|
||||
14
.github/mergify.yml
vendored
14
.github/mergify.yml
vendored
@ -118,6 +118,20 @@ pull_request_rules:
|
||||
add:
|
||||
- qwen
|
||||
|
||||
- name: label-gpt-oss
|
||||
description: Automatically apply gpt-oss label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^examples/.*gpt[-_]?oss.*\.py
|
||||
- files~=^tests/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
|
||||
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
|
||||
- title~=(?i)gpt[-_]?oss
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- gpt-oss
|
||||
|
||||
- name: label-rocm
|
||||
description: Automatically apply rocm label
|
||||
conditions:
|
||||
|
||||
8
.github/scripts/cleanup_pr_body.sh
vendored
8
.github/scripts/cleanup_pr_body.sh
vendored
@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
||||
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
|
||||
sed -i '/FIX #xxxx.*$/d' "${NEW}"
|
||||
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
|
||||
sed -i '/<!--.*-->$/d' "${NEW}"
|
||||
|
||||
# Remove "FILL IN THE PR DESCRIPTION HERE"
|
||||
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
|
||||
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
|
||||
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
|
||||
|
||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@ -150,7 +150,8 @@ venv.bak/
|
||||
# mkdocs documentation
|
||||
/site
|
||||
docs/argparse
|
||||
docs/examples
|
||||
docs/examples/*
|
||||
!docs/examples/README.md
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
@ -18,14 +18,15 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
|
||||
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
||||
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
||||
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||
|
||||
<details>
|
||||
<summary>Previous News</summary>
|
||||
|
||||
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||
@ -121,6 +122,7 @@ Cash Donations:
|
||||
|
||||
Compute Resources:
|
||||
|
||||
- Alibaba Cloud
|
||||
- AMD
|
||||
- Anyscale
|
||||
- AWS
|
||||
@ -160,7 +162,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
||||
## Contact Us
|
||||
|
||||
<!-- --8<-- [start:contact-us] -->
|
||||
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
|
||||
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
|
||||
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
|
||||
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
|
||||
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
|
||||
|
||||
@ -31,7 +31,7 @@ class RequestFuncInput:
|
||||
model_name: Optional[str] = None
|
||||
logprobs: Optional[int] = None
|
||||
extra_body: Optional[dict] = None
|
||||
multi_modal_content: Optional[dict] = None
|
||||
multi_modal_content: Optional[dict | list[dict]] = None
|
||||
ignore_eos: bool = False
|
||||
language: Optional[str] = None
|
||||
|
||||
@ -364,7 +364,15 @@ async def async_request_openai_chat_completions(
|
||||
) as session:
|
||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||
if request_func_input.multi_modal_content:
|
||||
content.append(request_func_input.multi_modal_content)
|
||||
mm_content = request_func_input.multi_modal_content
|
||||
if isinstance(mm_content, list):
|
||||
content.extend(mm_content)
|
||||
elif isinstance(mm_content, dict):
|
||||
content.append(mm_content)
|
||||
else:
|
||||
raise TypeError(
|
||||
"multi_modal_content must be a dict or list[dict] for openai-chat"
|
||||
)
|
||||
payload = {
|
||||
"model": request_func_input.model_name
|
||||
if request_func_input.model_name
|
||||
@ -491,7 +499,10 @@ async def async_request_openai_audio(
|
||||
buffer.seek(0)
|
||||
return buffer
|
||||
|
||||
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
|
||||
mm_audio = request_func_input.multi_modal_content
|
||||
if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
|
||||
raise TypeError("multi_modal_content must be a dict containing 'audio'")
|
||||
with to_bytes(*mm_audio["audio"]) as f:
|
||||
form = aiohttp.FormData()
|
||||
form.add_field("file", f, content_type="audio/wav")
|
||||
for key, value in payload.items():
|
||||
|
||||
74
benchmarks/benchmark_block_pool.py
Normal file
74
benchmarks/benchmark_block_pool.py
Normal file
@ -0,0 +1,74 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for allocate_block in args.allocate_blocks:
|
||||
# Enforce a GC collect ahead to minimize the impact among runs
|
||||
gc.collect()
|
||||
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
|
||||
|
||||
get_blocks_times = TimeCollector(TimeCollector.US)
|
||||
free_blocks_times = TimeCollector(TimeCollector.US)
|
||||
for _ in range(args.num_iteration):
|
||||
with get_blocks_times:
|
||||
blocks = block_pool.get_new_blocks(allocate_block)
|
||||
with free_blocks_times:
|
||||
block_pool.free_blocks(blocks)
|
||||
|
||||
rows.append(
|
||||
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
|
||||
+ get_blocks_times.dump_avg_max()
|
||||
+ free_blocks_times.dump_avg_max()
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"Iterations",
|
||||
"Total\nBlocks",
|
||||
"Allocated\nBlocks",
|
||||
"Get Blocks\nAvg (us)",
|
||||
"Get Blocks\nMax (us)",
|
||||
"Free Blocks\nAvg (us)",
|
||||
"Free Blocks\nMax (us)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".3f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of BlockPool for KV Cache."
|
||||
)
|
||||
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allocate-blocks",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[10, 50, 100, 500, 1000],
|
||||
help="Number of blocks to allocate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
@ -52,7 +52,7 @@ class SampleRequest:
|
||||
prompt: Union[str, Any]
|
||||
prompt_len: int
|
||||
expected_output_len: int
|
||||
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
|
||||
multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
|
||||
lora_request: Optional[LoRARequest] = None
|
||||
|
||||
|
||||
|
||||
112
benchmarks/benchmark_ngram_proposer.py
Normal file
112
benchmarks/benchmark_ngram_proposer.py
Normal file
@ -0,0 +1,112 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
|
||||
import numpy as np
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for max_ngram in args.max_ngram:
|
||||
collector = TimeCollector(TimeCollector.US)
|
||||
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
max_model_len=args.num_token + args.num_spec_token,
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=None,
|
||||
trust_remote_code=False,
|
||||
)
|
||||
proposer = NgramProposer(
|
||||
vllm_config=VllmConfig(
|
||||
model_config=model_config,
|
||||
speculative_config=SpeculativeConfig(
|
||||
prompt_lookup_min=args.min_ngram,
|
||||
prompt_lookup_max=max_ngram,
|
||||
num_speculative_tokens=args.num_spec_token,
|
||||
method="ngram",
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Warm up
|
||||
proposer.propose(np.random.randint(0, 20, (args.num_token,)))
|
||||
|
||||
gc.collect()
|
||||
for _ in range(args.num_iteration):
|
||||
tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
|
||||
with collector:
|
||||
for i in range(args.num_req):
|
||||
proposer.propose(tokens[i, :])
|
||||
rows.append(
|
||||
[args.num_req, args.num_token, args.min_ngram, max_ngram]
|
||||
+ collector.dump_avg_max()
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"# Request",
|
||||
"# Token",
|
||||
"Min Ngram",
|
||||
"Max Ngram",
|
||||
"Avg (us)",
|
||||
"Max (us)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".3f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of N-gram speculative decode drafting"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-req", type=int, default=128, help="Number of requests in the batch"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-token", type=int, default=1500, help="Number of tokens for each request"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-ngram",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Minimum n-gram to match",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-ngram",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[5, 7, 10, 15, 20],
|
||||
help="Maximum n-gram to match",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-spec-token",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of speculative tokens to generate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
@ -263,7 +263,14 @@ async def benchmark(
|
||||
input_requests[0].multi_modal_data,
|
||||
)
|
||||
|
||||
assert test_mm_content is None or isinstance(test_mm_content, dict)
|
||||
assert (
|
||||
test_mm_content is None
|
||||
or isinstance(test_mm_content, dict)
|
||||
or (
|
||||
isinstance(test_mm_content, list)
|
||||
and all(isinstance(item, dict) for item in test_mm_content)
|
||||
)
|
||||
), "multi_modal_data must be a dict or list[dict]"
|
||||
test_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
model_name=model_name,
|
||||
|
||||
@ -1,11 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
from typing import Any
|
||||
import time
|
||||
from types import TracebackType
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None:
|
||||
cls=InfEncoder,
|
||||
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
|
||||
)
|
||||
|
||||
|
||||
# Collect time and generate time metrics
|
||||
#
|
||||
# Example Usage:
|
||||
# collector = TimeCollector(TimeCollector.US)
|
||||
# for _ in range(total_iteration):
|
||||
# with collector:
|
||||
# ...
|
||||
# collector.dump_avg_max()
|
||||
class TimeCollector:
|
||||
NS: int = 1
|
||||
US: int = NS * 1000
|
||||
MS: int = US * 1000
|
||||
S: int = MS * 1000
|
||||
|
||||
def __init__(self, scale: int) -> None:
|
||||
self.cnt: int = 0
|
||||
self._sum: int = 0
|
||||
self._max: Optional[int] = None
|
||||
self.scale = scale
|
||||
self.start_time: int = time.monotonic_ns()
|
||||
|
||||
def collect(self, v: int) -> None:
|
||||
self.cnt += 1
|
||||
self._sum += v
|
||||
if self._max is None:
|
||||
self._max = v
|
||||
else:
|
||||
self._max = max(self._max, v)
|
||||
|
||||
def avg(self) -> Union[float, str]:
|
||||
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
||||
|
||||
def max(self) -> Union[float, str]:
|
||||
return self._max / self.scale if self._max else "N/A"
|
||||
|
||||
def dump_avg_max(self) -> list[Union[float, str]]:
|
||||
return [self.avg(), self.max()]
|
||||
|
||||
def __enter__(self) -> None:
|
||||
self.start_time = time.monotonic_ns()
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: Optional[type[BaseException]],
|
||||
exc_value: Optional[BaseException],
|
||||
exc_traceback: Optional[TracebackType],
|
||||
) -> None:
|
||||
self.collect(time.monotonic_ns() - self.start_time)
|
||||
|
||||
@ -22,10 +22,10 @@ from vllm.utils import FlexibleArgumentParser
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
|
||||
|
||||
def ensure_divisibility(numerator, denominator):
|
||||
def ensure_divisibility(numerator, denominator, text):
|
||||
"""Ensure that numerator is divisible by the denominator."""
|
||||
assert numerator % denominator == 0, (
|
||||
"intermediate_size {} is not divisible by tp {}.".format(numerator, denominator)
|
||||
assert numerator % denominator == 0, "{} {} is not divisible by tp {}.".format(
|
||||
text, numerator, denominator
|
||||
)
|
||||
|
||||
|
||||
@ -577,12 +577,10 @@ def main(args: argparse.Namespace):
|
||||
E = config.ffn_config.moe_num_experts
|
||||
topk = config.ffn_config.moe_top_k
|
||||
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] == "JambaForCausalLM":
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] in (
|
||||
"DeepseekV3ForCausalLM",
|
||||
"DeepseekV2ForCausalLM",
|
||||
@ -591,17 +589,14 @@ def main(args: argparse.Namespace):
|
||||
E = config.n_routed_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
|
||||
E = config.num_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
|
||||
E = config.num_experts
|
||||
topk = config.moe_topk[0]
|
||||
intermediate_size = config.moe_intermediate_size[0]
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
else:
|
||||
# Support for llama4
|
||||
config = config.get_text_config()
|
||||
@ -609,8 +604,14 @@ def main(args: argparse.Namespace):
|
||||
E = config.num_local_experts
|
||||
topk = config.num_experts_per_tok
|
||||
intermediate_size = config.intermediate_size
|
||||
enable_ep = bool(args.enable_expert_parallel)
|
||||
if enable_ep:
|
||||
ensure_divisibility(E, args.tp_size, "Number of experts")
|
||||
E = E // args.tp_size
|
||||
shard_intermediate_size = 2 * intermediate_size
|
||||
else:
|
||||
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
ensure_divisibility(intermediate_size, args.tp_size)
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
@ -742,6 +743,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
|
||||
)
|
||||
parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
|
||||
)
|
||||
|
||||
328
benchmarks/kernels/benchmark_mrope.py
Normal file
328
benchmarks/kernels/benchmark_mrope.py
Normal file
@ -0,0 +1,328 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models).
|
||||
# It generates test data, runs benchmarks, and saves results to a CSV file.
|
||||
#
|
||||
# The CSV file (named with current date/time) contains these columns:
|
||||
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
|
||||
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
|
||||
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
|
||||
# speedup
|
||||
#
|
||||
# == Usage Examples ==
|
||||
#
|
||||
# Single model benchmark:
|
||||
# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \
|
||||
# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
|
||||
#
|
||||
# All models benchmark:
|
||||
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
|
||||
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
|
||||
#
|
||||
# All models with different TP sizes:
|
||||
# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \
|
||||
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
|
||||
#
|
||||
# All models with different token counts:
|
||||
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
|
||||
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384
|
||||
import csv
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.config import get_config
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
def generate_test_data(
|
||||
num_tokens: int,
|
||||
num_q_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
max_position_embeddings: int,
|
||||
dtype: torch.dtype,
|
||||
device: torch.device,
|
||||
):
|
||||
"""Generate test data for given configuration."""
|
||||
# Create 2D positions (3, num_tokens) for multimodal case
|
||||
positions = torch.randint(
|
||||
0, max_position_embeddings // 4, (3, num_tokens), device=device
|
||||
)
|
||||
|
||||
# Create query and key tensors
|
||||
query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
|
||||
key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)
|
||||
|
||||
return positions, query, key
|
||||
|
||||
|
||||
def calculate_stats(times: list[float]) -> dict[str, float]:
|
||||
"""Calculate statistics from a list of times."""
|
||||
times_array = np.array(times)
|
||||
return {
|
||||
"mean": np.mean(times_array),
|
||||
"median": np.median(times_array),
|
||||
"p99": np.percentile(times_array, 99),
|
||||
"min": np.min(times_array),
|
||||
"max": np.max(times_array),
|
||||
}
|
||||
|
||||
|
||||
def benchmark_mrope(
|
||||
model_name: str,
|
||||
num_tokens: int,
|
||||
head_dim: int,
|
||||
tp_size: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
max_position: int = 8192,
|
||||
rope_theta: float = 10000,
|
||||
is_neox_style: bool = True,
|
||||
rope_scaling: dict[str, Any] = None,
|
||||
dtype: torch.dtype = torch.bfloat16,
|
||||
seed: int = 0,
|
||||
warmup_iter: int = 10,
|
||||
benchmark_iter: int = 100,
|
||||
csv_writer=None,
|
||||
):
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
# the parameters to compute the q k v size based on tp_size
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim,
|
||||
max_position=max_position,
|
||||
base=rope_theta,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_scaling=rope_scaling,
|
||||
dtype=dtype,
|
||||
).to(device=device)
|
||||
|
||||
print(80 * "=")
|
||||
print(
|
||||
f"Evaluating model: {model_name} "
|
||||
f"with tp_size: {tp_size} "
|
||||
f"and num_tokens: {num_tokens}, "
|
||||
f"dtype: {dtype}"
|
||||
)
|
||||
|
||||
# create q k v input tensors
|
||||
# create rotary pos emb input tensors
|
||||
positions, query, key = generate_test_data(
|
||||
num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
|
||||
)
|
||||
|
||||
# Warm up
|
||||
for _ in range(warmup_iter):
|
||||
mrope_helper_class.forward_native(
|
||||
positions,
|
||||
query.clone(),
|
||||
key.clone(),
|
||||
)
|
||||
|
||||
mrope_helper_class.forward_cuda(
|
||||
positions,
|
||||
query.clone(),
|
||||
key.clone(),
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
# Time reference implementation
|
||||
torch_times = []
|
||||
for _ in range(benchmark_iter):
|
||||
query_clone = query.clone()
|
||||
key_clone = key.clone()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
|
||||
mrope_helper_class.forward_native(
|
||||
positions,
|
||||
query_clone,
|
||||
key_clone,
|
||||
)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
torch_times.append(time.time() - start_time)
|
||||
|
||||
# Time triton kernel implementation
|
||||
triton_times = []
|
||||
for _ in range(benchmark_iter):
|
||||
query_clone = query.clone()
|
||||
key_clone = key.clone()
|
||||
torch.cuda.synchronize()
|
||||
start_time = time.time()
|
||||
mrope_helper_class.forward_cuda(
|
||||
positions,
|
||||
query_clone,
|
||||
key_clone,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
triton_times.append(time.time() - start_time)
|
||||
|
||||
# Calculate statistics
|
||||
torch_stats = calculate_stats(torch_times)
|
||||
triton_stats = calculate_stats(triton_times)
|
||||
print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):")
|
||||
|
||||
print(
|
||||
f"Torch implementation: "
|
||||
f"mean={torch_stats['mean']:.8f}s, "
|
||||
f"median={torch_stats['median']:.8f}s, "
|
||||
f"p99={torch_stats['p99']:.8f}s"
|
||||
)
|
||||
|
||||
print(
|
||||
f"Triton implementation: "
|
||||
f"mean={triton_stats['mean']:.8f}s, "
|
||||
f"median={triton_stats['median']:.8f}s, "
|
||||
f"p99={triton_stats['p99']:.8f}s"
|
||||
)
|
||||
|
||||
print(
|
||||
f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x"
|
||||
)
|
||||
|
||||
# Write to CSV
|
||||
if csv_writer:
|
||||
row = [
|
||||
model_name,
|
||||
tp_size,
|
||||
num_tokens,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_dim,
|
||||
max_position,
|
||||
rope_theta,
|
||||
is_neox_style,
|
||||
str(rope_scaling),
|
||||
str(dtype).split(".")[-1],
|
||||
torch_stats["mean"],
|
||||
torch_stats["median"],
|
||||
torch_stats["p99"],
|
||||
torch_stats["min"],
|
||||
torch_stats["max"],
|
||||
triton_stats["mean"],
|
||||
triton_stats["median"],
|
||||
triton_stats["p99"],
|
||||
triton_stats["min"],
|
||||
triton_stats["max"],
|
||||
torch_stats["mean"] / triton_stats["mean"], # speedup
|
||||
]
|
||||
csv_writer.writerow(row)
|
||||
|
||||
return torch_stats, triton_stats
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the rotary embedding kernels."
|
||||
)
|
||||
parser.add_argument("--model-name", type=str, default="")
|
||||
parser.add_argument("--tp-size", type=int, default=1)
|
||||
parser.add_argument("--warmup-iter", type=int, default=10)
|
||||
parser.add_argument("--benchmark-iter", type=int, default=100)
|
||||
parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--num-tokens", type=int, nargs="+", required=False)
|
||||
parser.add_argument("--trust-remote-code", action="store_true")
|
||||
parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
# Create CSV file for results
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv"
|
||||
|
||||
with open(csv_filename, "w", newline="") as csvfile:
|
||||
csv_writer = csv.writer(csvfile)
|
||||
# Write header
|
||||
header = [
|
||||
"model_name",
|
||||
"tp_size",
|
||||
"num_tokens",
|
||||
"num_heads",
|
||||
"num_kv_heads",
|
||||
"head_dim",
|
||||
"max_position",
|
||||
"rope_theta",
|
||||
"is_neox_style",
|
||||
"rope_scaling",
|
||||
"dtype",
|
||||
"torch_mean",
|
||||
"torch_median",
|
||||
"torch_p99",
|
||||
"torch_min",
|
||||
"torch_max",
|
||||
"triton_mean",
|
||||
"triton_median",
|
||||
"triton_p99",
|
||||
"triton_min",
|
||||
"triton_max",
|
||||
"speedup",
|
||||
]
|
||||
csv_writer.writerow(header)
|
||||
|
||||
model_tp_dict = {}
|
||||
if args.model_name == "":
|
||||
model_tp_dict = {
|
||||
"Qwen/Qwen2-VL-2B-Instruct": [1],
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1],
|
||||
"Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8],
|
||||
"Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8],
|
||||
"Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8],
|
||||
"Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8],
|
||||
}
|
||||
else:
|
||||
model_tp_dict[args.model_name] = [args.tp_size]
|
||||
|
||||
if args.num_tokens is None:
|
||||
num_tokens_list = [2**i for i in range(0, 18)]
|
||||
else:
|
||||
num_tokens_list = args.num_tokens
|
||||
|
||||
for model_name, tp_list in model_tp_dict.items():
|
||||
config = get_config(model_name, trust_remote_code=args.trust_remote_code)
|
||||
for tp_size in tp_list:
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
q_size = num_heads * head_dim
|
||||
kv_size = num_kv_heads * head_dim
|
||||
is_neox_style = True
|
||||
rope_theta = config.rope_theta
|
||||
max_position = config.max_position_embeddings
|
||||
|
||||
for num_tokens in num_tokens_list:
|
||||
benchmark_mrope(
|
||||
model_name=model_name,
|
||||
num_tokens=num_tokens,
|
||||
head_dim=head_dim,
|
||||
tp_size=tp_size,
|
||||
num_heads=num_heads,
|
||||
num_kv_heads=num_kv_heads,
|
||||
max_position=max_position,
|
||||
rope_theta=rope_theta,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_scaling=config.rope_scaling,
|
||||
dtype=getattr(torch, args.dtype),
|
||||
seed=args.seed,
|
||||
warmup_iter=args.warmup_iter,
|
||||
benchmark_iter=args.benchmark_iter,
|
||||
csv_writer=csv_writer,
|
||||
)
|
||||
|
||||
print(f"Benchmark results saved to {csv_filename}")
|
||||
@ -1,108 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from tabulate import tabulate
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
|
||||
|
||||
class Metric:
|
||||
def __init__(self) -> None:
|
||||
self.cnt: int = 0
|
||||
self.sum_v: int = 0
|
||||
self.max_v: Optional[int] = None
|
||||
|
||||
def update(self, v: int) -> None:
|
||||
self.cnt += 1
|
||||
self.sum_v += v
|
||||
if self.max_v is None:
|
||||
self.max_v = v
|
||||
else:
|
||||
self.max_v = max(self.max_v, v)
|
||||
|
||||
def avg_v(self) -> float:
|
||||
return self.sum_v * 1.0 / self.cnt
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for allocate_block in args.allocate_blocks:
|
||||
# Enforce a GC collect ahead to minimize the impact among runs
|
||||
gc.collect()
|
||||
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
|
||||
|
||||
get_blocks_metric: Metric = Metric()
|
||||
free_blocks_metric: Metric = Metric()
|
||||
for _ in range(args.num_iteration):
|
||||
t1 = time.monotonic_ns()
|
||||
blocks = block_pool.get_new_blocks(allocate_block)
|
||||
t2 = time.monotonic_ns()
|
||||
block_pool.free_blocks(blocks)
|
||||
t3 = time.monotonic_ns()
|
||||
get_blocks_metric.update(t2 - t1)
|
||||
free_blocks_metric.update(t3 - t2)
|
||||
|
||||
if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
|
||||
rows.append(
|
||||
[
|
||||
get_blocks_metric.cnt,
|
||||
args.num_gpu_blocks,
|
||||
allocate_block,
|
||||
get_blocks_metric.avg_v() / 1000000,
|
||||
get_blocks_metric.max_v / 1000000.0,
|
||||
free_blocks_metric.avg_v() / 1000000,
|
||||
free_blocks_metric.max_v / 1000000.0,
|
||||
]
|
||||
)
|
||||
else:
|
||||
print(
|
||||
"No valid metrics found."
|
||||
f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"Iterations",
|
||||
"Total\nBlocks",
|
||||
"Allocated\nBlocks",
|
||||
"Get Blocks\nAvg (ms)",
|
||||
"Get Blocks\nMax (ms)",
|
||||
"Free Blocks\nAvg (ms)",
|
||||
"Free Blocks\nMax (ms)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".6f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of BlockPool for KV Cache."
|
||||
)
|
||||
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allocate-blocks",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[10, 50, 100, 500, 1000],
|
||||
help="Number of blocks to allocate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
@ -4,7 +4,7 @@ import logging
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class Color(str, Enum):
|
||||
class Color(Enum):
|
||||
RED = "\033[91m"
|
||||
GREEN = "\033[92m"
|
||||
BLUE = "\033[94m"
|
||||
@ -13,6 +13,9 @@ class Color(str, Enum):
|
||||
YELLOW = "\033[93m"
|
||||
RESET = "\033[0m"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
|
||||
TEXT_SEPARATOR = "-" * 100
|
||||
|
||||
|
||||
@ -188,7 +188,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(
|
||||
It fuses the softmax, max and argmax into a single kernel.
|
||||
|
||||
Limitations:
|
||||
1) This implementation is intended for when the number of experts is a small power of 2.
|
||||
1) This implementation is optimized for when the number of experts is a small power of 2.
|
||||
Additionally it also supports when number of experts is multiple of 64 which is still
|
||||
faster than the computing softmax and topK separately (only tested on CUDA yet).
|
||||
2) This implementation assumes k is small, but will work for any k.
|
||||
*/
|
||||
|
||||
@ -198,8 +200,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
|
||||
int* source_rows, const int k, const int start_expert, const int end_expert)
|
||||
{
|
||||
// We begin by enforcing compile time assertions and setting up compile time constants.
|
||||
static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
|
||||
static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
|
||||
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
|
||||
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
|
||||
|
||||
@ -407,12 +407,10 @@ struct TopkConstants
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, typename IndType>
|
||||
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
|
||||
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
|
||||
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
|
||||
{
|
||||
static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
|
||||
|
||||
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
||||
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
|
||||
static constexpr int VPT = Constants::VPT;
|
||||
@ -425,21 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
|
||||
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
|
||||
}
|
||||
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
|
||||
switch (warpSize) { \
|
||||
case 32: \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
break; \
|
||||
case 64: \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
break; \
|
||||
default: \
|
||||
TORCH_CHECK(false, "Unsupported warp size: ", warpSize); \
|
||||
#ifndef USE_ROCM
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
static_assert(WARP_SIZE == 32, \
|
||||
"Unsupported warp size. Only 32 is supported for CUDA"); \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
|
||||
#else
|
||||
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
|
||||
if (WARP_SIZE == 64) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
} else if (WARP_SIZE == 32) { \
|
||||
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
|
||||
gating_output, nullptr, topk_weights, topk_indices, \
|
||||
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
|
||||
} else { \
|
||||
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename IndType>
|
||||
void topkGatingSoftmaxKernelLauncher(
|
||||
@ -453,38 +457,64 @@ void topkGatingSoftmaxKernelLauncher(
|
||||
const int topk,
|
||||
cudaStream_t stream) {
|
||||
static constexpr int WARPS_PER_TB = 4;
|
||||
auto warpSize = WARP_SIZE;
|
||||
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
|
||||
#ifndef USE_ROCM
|
||||
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
|
||||
#endif
|
||||
switch (num_experts) {
|
||||
case 1:
|
||||
LAUNCH_SOFTMAX(1, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 2:
|
||||
LAUNCH_SOFTMAX(2, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 4:
|
||||
LAUNCH_SOFTMAX(4, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 8:
|
||||
LAUNCH_SOFTMAX(8, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 16:
|
||||
LAUNCH_SOFTMAX(16, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 32:
|
||||
LAUNCH_SOFTMAX(32, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 64:
|
||||
LAUNCH_SOFTMAX(64, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 128:
|
||||
LAUNCH_SOFTMAX(128, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 256:
|
||||
LAUNCH_SOFTMAX(256, WARPS_PER_TB);
|
||||
LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
case 512:
|
||||
LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
|
||||
break;
|
||||
// (CUDA only) support multiples of 64 when num_experts is not power of 2.
|
||||
// ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
|
||||
// alternatively we can test 4 bytes loading and enable it in future.
|
||||
#ifndef USE_ROCM
|
||||
case 192:
|
||||
LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 320:
|
||||
LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 384:
|
||||
LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 448:
|
||||
LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
case 576:
|
||||
LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
|
||||
break;
|
||||
#endif
|
||||
default: {
|
||||
TORCH_CHECK(softmax_workspace != nullptr,
|
||||
"softmax_workspace must be provided for num_experts that are not a power of 2.");
|
||||
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
|
||||
static constexpr int TPB = 256;
|
||||
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
|
||||
gating_output, nullptr, softmax_workspace, num_experts);
|
||||
|
||||
@ -270,7 +270,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int num_kv_heads,
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -304,12 +304,12 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
|
||||
const auto max_num_partitions = gridDim.y;
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
|
||||
const int partition_start_token_idx =
|
||||
partition_idx * T_PAR_SIZE; // partition_size;
|
||||
// exit if partition is out of context for seq
|
||||
if (partition_start_token_idx >= context_len) {
|
||||
if (partition_start_token_idx >= seq_len) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -361,8 +361,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
// output layout from QKmfma : QH16xT4x4 16 qheads across 16 lanes, 16 tokens
|
||||
// across 4 rows x 4 tokens per lane
|
||||
|
||||
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
|
||||
const int last_ctx_block = num_context_blocks - 1;
|
||||
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
|
||||
const int last_seq_block = num_seq_blocks - 1;
|
||||
|
||||
const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
|
||||
|
||||
@ -373,9 +373,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int klocal_token_idx =
|
||||
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
|
||||
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
|
||||
const int kblock_idx = (kglobal_token_idx < context_len)
|
||||
const int kblock_idx = (kglobal_token_idx < seq_len)
|
||||
? kglobal_token_idx / BLOCK_SIZE
|
||||
: last_ctx_block;
|
||||
: last_seq_block;
|
||||
kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
|
||||
}
|
||||
|
||||
@ -476,9 +476,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
// tokens
|
||||
const int vglobal_token_idx =
|
||||
partition_start_token_idx + vlocal_token_idx;
|
||||
const int vblock_idx = (vglobal_token_idx < context_len)
|
||||
const int vblock_idx = (vglobal_token_idx < seq_len)
|
||||
? vglobal_token_idx / BLOCK_SIZE
|
||||
: last_ctx_block;
|
||||
: last_seq_block;
|
||||
vphysical_block_number[vtoken_depth][vblock_depth] =
|
||||
block_table_seq[vblock_idx];
|
||||
}
|
||||
@ -554,7 +554,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
if constexpr (ALIBI_ENABLED) {
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
const int local_token_idx = qkout_token_idx + token_depth * 16;
|
||||
const int alibi_offset = local_token_idx - context_len + 1;
|
||||
const int alibi_offset = local_token_idx - seq_len + 1;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
d_out[token_depth][i] += alibi_slope * (alibi_offset + i);
|
||||
}
|
||||
@ -568,9 +568,8 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
const int local_token_idx = qkout_token_idx + token_depth * 16;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
const float tmp = (local_token_idx + i < context_len)
|
||||
? d_out[token_depth][i]
|
||||
: -FLT_MAX;
|
||||
const float tmp =
|
||||
(local_token_idx + i < seq_len) ? d_out[token_depth][i] : -FLT_MAX;
|
||||
qk_max = fmaxf(qk_max, tmp);
|
||||
}
|
||||
}
|
||||
@ -582,7 +581,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
const int local_token_idx = qkout_token_idx + token_depth * 16;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
const float tmp = (local_token_idx + i < context_len)
|
||||
const float tmp = (local_token_idx + i < seq_len)
|
||||
? __expf(d_out[token_depth][i] - qk_max)
|
||||
: 0.0f;
|
||||
d_out[token_depth][i] = tmp;
|
||||
@ -780,7 +779,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const int num_kv_heads,
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -809,10 +808,10 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const auto partition_size = blockDim.x;
|
||||
const auto max_num_partitions = gridDim.y;
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int partition_start_token_idx = partition_idx * partition_size;
|
||||
// exit if partition is out of context for seq
|
||||
if (partition_start_token_idx >= context_len) {
|
||||
if (partition_start_token_idx >= seq_len) {
|
||||
return;
|
||||
}
|
||||
// every 4 lanes fetch 4 different qheads
|
||||
@ -855,7 +854,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const int warp_start_token_idx =
|
||||
partition_start_token_idx + warpid * WARP_SIZE;
|
||||
|
||||
if (warp_start_token_idx >= context_len) { // warp out of context
|
||||
if (warp_start_token_idx >= seq_len) { // warp out of context
|
||||
#pragma unroll
|
||||
for (int h = 0; h < GQA_RATIO4; h++) {
|
||||
shared_qk_max[warpid][h] = -FLT_MAX;
|
||||
@ -863,8 +862,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
}
|
||||
} else { // warp within context
|
||||
|
||||
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
|
||||
const int last_ctx_block = num_context_blocks - 1;
|
||||
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
|
||||
const int last_seq_block = num_seq_blocks - 1;
|
||||
|
||||
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
|
||||
// token id within partition
|
||||
@ -873,9 +872,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const int global_token_idx = partition_start_token_idx + local_token_idx;
|
||||
|
||||
// fetch block number for k
|
||||
const int block_idx = (global_token_idx < context_len)
|
||||
const int block_idx = (global_token_idx < seq_len)
|
||||
? global_token_idx / BLOCK_SIZE
|
||||
: last_ctx_block;
|
||||
: last_seq_block;
|
||||
|
||||
// fetch k physical block number
|
||||
// int32 physical_block_number leads to overflow when multiplied with
|
||||
@ -888,7 +887,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
for (int b = 0; b < VBLOCKS; b++) {
|
||||
const int vblock_idx = warp_start_block_idx + b;
|
||||
const int vblock_idx_ctx =
|
||||
(vblock_idx <= last_ctx_block) ? vblock_idx : last_ctx_block;
|
||||
(vblock_idx <= last_seq_block) ? vblock_idx : last_seq_block;
|
||||
vphysical_blocks[b] = block_table[vblock_idx_ctx];
|
||||
}
|
||||
|
||||
@ -1057,7 +1056,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const int lane4_token_idx = 4 * (global_token_idx >> 2);
|
||||
|
||||
if constexpr (ALIBI_ENABLED) {
|
||||
const int alibi_offset = lane4_token_idx - context_len + 1;
|
||||
const int alibi_offset = lane4_token_idx - seq_len + 1;
|
||||
for (int h = 0; h < QHLOOP; h++) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
d_out[h][i] += alibi_slope[h] * (alibi_offset + i);
|
||||
@ -1070,7 +1069,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
for (int h = 0; h < QHLOOP; h++) {
|
||||
qk_max[h] = -FLT_MAX;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
qk_max[h] = (lane4_token_idx + i < context_len)
|
||||
qk_max[h] = (lane4_token_idx + i < seq_len)
|
||||
? fmaxf(qk_max[h], d_out[h][i])
|
||||
: qk_max[h];
|
||||
}
|
||||
@ -1101,7 +1100,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
for (int h = 0; h < QHLOOP; h++) {
|
||||
exp_sum[h] = 0.0f;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
d_out[h][i] = (lane4_token_idx + i < context_len)
|
||||
d_out[h][i] = (lane4_token_idx + i < seq_len)
|
||||
? __expf(d_out[h][i] - qk_max[h])
|
||||
: 0.0f;
|
||||
exp_sum[h] += d_out[h][i];
|
||||
@ -1181,7 +1180,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
if (warp_start_token_idx >= context_len) { // warp out of context
|
||||
if (warp_start_token_idx >= seq_len) { // warp out of context
|
||||
for (int qh = 0; qh < QHLOOP; qh++) {
|
||||
for (int vh = 0; vh < VHELOOP; vh++) {
|
||||
vout_shared[qh][vh][laneid][warpid] = {0};
|
||||
@ -1279,7 +1278,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
// max_num_partitions]
|
||||
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
|
||||
// max_num_partitions, head_size]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
|
||||
const auto num_heads = gridDim.x;
|
||||
@ -1293,8 +1292,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
|
||||
const auto warpid = threadIdx.x / WARP_SIZE;
|
||||
|
||||
__shared__ float shared_global_exp_sum;
|
||||
@ -1581,7 +1580,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
// head_size, block_size]
|
||||
const int num_kv_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -1615,11 +1614,11 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
|
||||
const int max_num_partitions = gridDim.y;
|
||||
|
||||
const int context_len = context_lens[seq_idx]; // length of a seq
|
||||
const int seq_len = seq_lens[seq_idx]; // length of a seq
|
||||
|
||||
const int partition_start_token_idx = partition_idx * T_PAR_SIZE;
|
||||
// exit if partition is out of context for seq
|
||||
if (partition_start_token_idx >= context_len) {
|
||||
if (partition_start_token_idx >= seq_len) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1715,8 +1714,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
|
||||
const int last_ctx_block = num_context_blocks - 1;
|
||||
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
|
||||
const int last_seq_block = num_seq_blocks - 1;
|
||||
|
||||
const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
|
||||
|
||||
@ -1727,9 +1726,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int klocal_token_idx =
|
||||
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
|
||||
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
|
||||
const int kblock_idx = (kglobal_token_idx < context_len)
|
||||
const int kblock_idx = (kglobal_token_idx < seq_len)
|
||||
? kglobal_token_idx / BLOCK_SIZE
|
||||
: last_ctx_block;
|
||||
: last_seq_block;
|
||||
kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
|
||||
}
|
||||
|
||||
@ -1781,9 +1780,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
vblock_depth * BLOCK_SIZE;
|
||||
const int vglobal_token_idx =
|
||||
partition_start_token_idx + vlocal_token_idx;
|
||||
const int vblock_idx = (vglobal_token_idx < context_len)
|
||||
const int vblock_idx = (vglobal_token_idx < seq_len)
|
||||
? vglobal_token_idx / BLOCK_SIZE
|
||||
: last_ctx_block;
|
||||
: last_seq_block;
|
||||
vphysical_block_number[vtoken_depth][vblock_depth] =
|
||||
block_table_seq[vblock_idx];
|
||||
}
|
||||
@ -1836,9 +1835,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
const int local_token_idx = qkout_token_idx + token_depth * 16;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
const float tmp = (local_token_idx + 2 * i < context_len)
|
||||
? dout[token_depth][i]
|
||||
: -FLT_MAX;
|
||||
const float tmp =
|
||||
(local_token_idx + 2 * i < seq_len) ? dout[token_depth][i] : -FLT_MAX;
|
||||
qk_max = fmaxf(qk_max, tmp);
|
||||
}
|
||||
}
|
||||
@ -1848,7 +1846,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
const int local_token_idx = qkout_token_idx + token_depth * 16;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
const float tmp = (local_token_idx + 2 * i < context_len)
|
||||
const float tmp = (local_token_idx + 2 * i < seq_len)
|
||||
? __expf(dout[token_depth][i] - qk_max)
|
||||
: 0.0f;
|
||||
dout[token_depth][i] = tmp;
|
||||
@ -2019,7 +2017,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
// head_size, block_size]
|
||||
const int num_kv_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -2046,7 +2044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
// max_num_partitions]
|
||||
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
|
||||
// max_num_partitions, head_size]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
|
||||
const auto num_heads = gridDim.x;
|
||||
@ -2060,8 +2058,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
|
||||
const int warpid = threadIdx.x / WARP_SIZE;
|
||||
|
||||
__shared__ float shared_global_exp_sum;
|
||||
@ -2349,7 +2347,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
// head_size, block_size]
|
||||
const int num_kv_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -2382,11 +2380,11 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
|
||||
const int max_num_partitions = gridDim.y;
|
||||
|
||||
const int context_len = context_lens[seq_idx]; // length of a seq
|
||||
const int seq_len = seq_lens[seq_idx]; // length of a seq
|
||||
|
||||
const int partition_start_token_idx = partition_idx * T_PAR_SIZE;
|
||||
// exit if partition is out of context for seq
|
||||
if (partition_start_token_idx >= context_len) {
|
||||
if (partition_start_token_idx >= seq_len) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -2482,8 +2480,8 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
|
||||
const int last_ctx_block = num_context_blocks - 1;
|
||||
const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
|
||||
const int last_seq_block = num_seq_blocks - 1;
|
||||
|
||||
const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
|
||||
|
||||
@ -2494,9 +2492,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int klocal_token_idx =
|
||||
TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
|
||||
const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
|
||||
const int kblock_idx = (kglobal_token_idx < context_len)
|
||||
const int kblock_idx = (kglobal_token_idx < seq_len)
|
||||
? kglobal_token_idx / BLOCK_SIZE
|
||||
: last_ctx_block;
|
||||
: last_seq_block;
|
||||
kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
|
||||
}
|
||||
|
||||
@ -2548,9 +2546,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
rowid * VTOKENS_PER_LANE + vblock_depth * BLOCK_SIZE;
|
||||
const int vglobal_token_idx =
|
||||
partition_start_token_idx + vlocal_token_idx;
|
||||
const int vblock_idx = (vglobal_token_idx < context_len)
|
||||
const int vblock_idx = (vglobal_token_idx < seq_len)
|
||||
? vglobal_token_idx / BLOCK_SIZE
|
||||
: last_ctx_block;
|
||||
: last_seq_block;
|
||||
vphysical_block_number[vtoken_depth][vblock_depth] =
|
||||
block_table_seq[vblock_idx];
|
||||
}
|
||||
@ -2604,7 +2602,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int local_token_idx = qkout_token_idx + token_depth * 16;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
const float tmp =
|
||||
(local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX;
|
||||
(local_token_idx + i < seq_len) ? dout[token_depth][i] : -FLT_MAX;
|
||||
qk_max = fmaxf(qk_max, tmp);
|
||||
}
|
||||
}
|
||||
@ -2614,7 +2612,7 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
|
||||
const int local_token_idx = qkout_token_idx + token_depth * 16;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
const float tmp = (local_token_idx + i < context_len)
|
||||
const float tmp = (local_token_idx + i < seq_len)
|
||||
? __expf(dout[token_depth][i] - qk_max)
|
||||
: 0.0f;
|
||||
dout[token_depth][i] = tmp;
|
||||
@ -2751,7 +2749,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
// head_size, block_size]
|
||||
const int num_kv_heads, const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -2778,7 +2776,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
// max_num_partitions]
|
||||
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
|
||||
// max_num_partitions, head_size]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
|
||||
const auto num_heads = gridDim.x;
|
||||
@ -2792,8 +2790,8 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
||||
const int seq_len = seq_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
|
||||
const int warpid = threadIdx.x / WARP_SIZE;
|
||||
|
||||
__shared__ float shared_global_exp_sum;
|
||||
@ -2980,7 +2978,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int num_kv_heads,
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -3007,7 +3005,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const int num_kv_heads,
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
@ -3031,7 +3029,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
|
||||
const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
|
||||
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ seq_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
|
||||
UNREACHABLE_CODE
|
||||
@ -3046,7 +3044,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
GQA_RATIO> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \
|
||||
block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \
|
||||
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
|
||||
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \
|
||||
max_ctx_blocks, k_scale_ptr, v_scale_ptr);
|
||||
@ -3057,18 +3055,17 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
GQA_RATIO> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \
|
||||
block_tables_ptr, seq_lens_ptr, query_start_loc_ptr, \
|
||||
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
|
||||
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \
|
||||
max_ctx_blocks, k_scale_ptr, v_scale_ptr);
|
||||
|
||||
#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \
|
||||
paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
|
||||
PARTITION_SIZE, NPAR_LOOPS> \
|
||||
<<<reduce_grid, reduce_block, 0, stream>>>( \
|
||||
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \
|
||||
context_lens_ptr, query_start_loc_ptr, max_num_partitions, \
|
||||
fp8_out_scale_ptr);
|
||||
#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \
|
||||
paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
|
||||
PARTITION_SIZE, NPAR_LOOPS> \
|
||||
<<<reduce_grid, reduce_block, 0, stream>>>( \
|
||||
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
|
||||
query_start_loc_ptr, max_num_partitions, fp8_out_scale_ptr);
|
||||
|
||||
template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
|
||||
int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
|
||||
@ -3077,8 +3074,8 @@ void paged_attention_custom_launcher(
|
||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, const int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& context_lens,
|
||||
const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens,
|
||||
const std::optional<torch::Tensor>& query_start_loc, int max_seq_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale) {
|
||||
int num_seqs = block_tables.size(0);
|
||||
@ -3109,7 +3106,7 @@ void paged_attention_custom_launcher(
|
||||
KVT* key_cache_ptr = reinterpret_cast<KVT*>(key_cache.data_ptr());
|
||||
KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
|
||||
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||
int* context_lens_ptr = context_lens.data_ptr<int>();
|
||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
|
||||
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
|
||||
// NOTE: fp8_out_scale is optional.
|
||||
@ -3119,13 +3116,12 @@ void paged_attention_custom_launcher(
|
||||
: nullptr;
|
||||
OUTT* out_ptr = reinterpret_cast<OUTT*>(out.data_ptr());
|
||||
|
||||
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
|
||||
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE);
|
||||
|
||||
// partition size is fixed at 256 since both mfma4 and mfma16 kernels support
|
||||
// it mfma4 kernel also supports partition size 512
|
||||
constexpr int PARTITION_SIZE = 256;
|
||||
const int max_num_partitions =
|
||||
DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
|
||||
const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
|
||||
const int gqa_ratio = num_heads / num_kv_heads;
|
||||
assert(num_heads % num_kv_heads == 0);
|
||||
assert(head_size == HEAD_SIZE);
|
||||
@ -3234,8 +3230,8 @@ void paged_attention_custom_launcher_navi(
|
||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, const int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& context_lens,
|
||||
const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens,
|
||||
const std::optional<torch::Tensor>& query_start_loc, int max_seq_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale) {
|
||||
int num_seqs = block_tables.size(0);
|
||||
@ -3263,7 +3259,7 @@ void paged_attention_custom_launcher_navi(
|
||||
KVT* key_cache_ptr = reinterpret_cast<KVT*>(key_cache.data_ptr());
|
||||
KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
|
||||
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||
int* context_lens_ptr = context_lens.data_ptr<int>();
|
||||
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||
|
||||
const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
|
||||
const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
|
||||
@ -3271,11 +3267,10 @@ void paged_attention_custom_launcher_navi(
|
||||
const auto fp8_out_scale_ptr = nullptr;
|
||||
OUTT* out_ptr = reinterpret_cast<OUTT*>(out.data_ptr());
|
||||
|
||||
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
|
||||
const int max_ctx_blocks = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE);
|
||||
|
||||
constexpr int PARTITION_SIZE = 256;
|
||||
const int max_num_partitions =
|
||||
DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
|
||||
const int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
|
||||
const int gqa_ratio = num_heads / num_kv_heads;
|
||||
assert(num_heads % num_kv_heads == 0);
|
||||
assert(head_size == HEAD_SIZE);
|
||||
@ -3407,14 +3402,14 @@ void paged_attention_custom_launcher_navi(
|
||||
paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
OUTT, PSIZE, ALIBI_ENABLED>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, context_lens, query_start_loc, \
|
||||
max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \
|
||||
num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \
|
||||
max_seq_len, alibi_slopes, k_scale, v_scale, fp8_out_scale); \
|
||||
} else { \
|
||||
paged_attention_custom_launcher_navi< \
|
||||
T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, context_lens, query_start_loc, \
|
||||
max_context_len, alibi_slopes, k_scale, v_scale); \
|
||||
num_kv_heads, scale, block_tables, seq_lens, query_start_loc, \
|
||||
max_seq_len, alibi_slopes, k_scale, v_scale); \
|
||||
}
|
||||
|
||||
#define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
@ -3502,9 +3497,9 @@ void paged_attention(
|
||||
int64_t num_kv_heads,
|
||||
double scale,
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& context_lens, // [num_seqs]
|
||||
torch::Tensor& seq_lens, // [num_seqs]
|
||||
const std::optional<torch::Tensor>& query_start_loc, // [num_seqs]
|
||||
int64_t block_size, int64_t max_context_len,
|
||||
int64_t block_size, int64_t max_seq_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale,
|
||||
|
||||
@ -15,8 +15,8 @@ void paged_attention(
|
||||
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& context_lens,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens,
|
||||
const std::optional<torch::Tensor>& query_start_loc, int64_t block_size,
|
||||
int64_t max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
int64_t max_seq_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale);
|
||||
|
||||
@ -41,10 +41,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
|
||||
" Tensor query, Tensor key_cache,"
|
||||
" Tensor value_cache, int num_kv_heads,"
|
||||
" float scale, Tensor block_tables,"
|
||||
" Tensor context_lens,"
|
||||
" Tensor seq_lens,"
|
||||
" Tensor? query_start_loc,"
|
||||
" int block_size,"
|
||||
" int max_context_len,"
|
||||
" int max_seq_len,"
|
||||
" Tensor? alibi_slopes,"
|
||||
" str kv_cache_dtype,"
|
||||
" Tensor k_scale, Tensor v_scale,"
|
||||
|
||||
@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED
|
||||
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
|
||||
ENV VLLM_USE_PRECOMPILED=""
|
||||
RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \
|
||||
export VLLM_USE_PRECOMPILED=1 && \
|
||||
echo "Using precompiled wheels"; \
|
||||
else \
|
||||
unset VLLM_USE_PRECOMPILED && \
|
||||
echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \
|
||||
fi
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
&& sccache --show-stats; \
|
||||
@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
# Clean any existing CMake artifacts
|
||||
rm -rf .deps && \
|
||||
mkdir -p .deps && \
|
||||
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
@ -392,7 +387,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
|
||||
# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
|
||||
ARG FLASHINFER_GIT_REF="v0.2.10"
|
||||
ARG FLASHINFER_GIT_REF="v0.2.11"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||
. /etc/environment
|
||||
git clone --depth 1 --recursive --shallow-submodules \
|
||||
@ -437,7 +432,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
|
||||
ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
|
||||
ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
||||
. /etc/environment
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"
|
||||
|
||||
@ -1,25 +1,17 @@
|
||||
nav:
|
||||
- Home:
|
||||
- vLLM: README.md
|
||||
- Home: README.md
|
||||
- User Guide:
|
||||
- usage/README.md
|
||||
- Getting Started:
|
||||
- getting_started/quickstart.md
|
||||
- getting_started/installation
|
||||
- Examples:
|
||||
- examples/README.md
|
||||
- Offline Inference: examples/offline_inference
|
||||
- Online Serving: examples/online_serving
|
||||
- Others: examples/others
|
||||
- Quick Links:
|
||||
- User Guide: usage/README.md
|
||||
- Developer Guide: contributing/README.md
|
||||
- API Reference: api/README.md
|
||||
- CLI Reference: cli/README.md
|
||||
- Timeline:
|
||||
- Roadmap: https://roadmap.vllm.ai
|
||||
- Releases: https://github.com/vllm-project/vllm/releases
|
||||
- User Guide:
|
||||
- Summary: usage/README.md
|
||||
- usage/v1_guide.md
|
||||
- General:
|
||||
- usage/v1_guide.md
|
||||
- usage/*
|
||||
- Inference and Serving:
|
||||
- serving/offline_inference.md
|
||||
@ -32,7 +24,7 @@ nav:
|
||||
- deployment/integrations
|
||||
- Training: training
|
||||
- Configuration:
|
||||
- Summary: configuration/README.md
|
||||
- configuration/README.md
|
||||
- configuration/*
|
||||
- Models:
|
||||
- models/supported_models.md
|
||||
@ -45,11 +37,11 @@ nav:
|
||||
- features/*
|
||||
- features/quantization
|
||||
- Developer Guide:
|
||||
- Summary: contributing/README.md
|
||||
- contributing/README.md
|
||||
- General:
|
||||
- glob: contributing/*
|
||||
flatten_single_child_sections: true
|
||||
- Model Implementation:
|
||||
- Model Implementation:
|
||||
- contributing/model/README.md
|
||||
- contributing/model/basic.md
|
||||
- contributing/model/registration.md
|
||||
@ -58,11 +50,9 @@ nav:
|
||||
- CI: contributing/ci
|
||||
- Design Documents: design
|
||||
- API Reference:
|
||||
- Summary: api/summary.md
|
||||
- Contents:
|
||||
- api/vllm/*
|
||||
- CLI Reference:
|
||||
- Summary: cli/README.md
|
||||
- api/README.md
|
||||
- api/vllm/*
|
||||
- CLI Reference: cli
|
||||
- Community:
|
||||
- community/*
|
||||
- Blog: https://blog.vllm.ai
|
||||
|
||||
@ -1,3 +1,9 @@
|
||||
---
|
||||
hide:
|
||||
- navigation
|
||||
- toc
|
||||
---
|
||||
|
||||
# Welcome to vLLM
|
||||
|
||||
<figure markdown="span">
|
||||
@ -21,6 +27,17 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
|
||||
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
|
||||
|
||||
Where to get started with vLLM depends on the type of user. If you are looking to:
|
||||
|
||||
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
|
||||
- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
|
||||
- Build vLLM, we recommend starting with [Developer Guide](./contributing)
|
||||
|
||||
For information about the development of vLLM, see:
|
||||
|
||||
- [Roadmap](https://roadmap.vllm.ai)
|
||||
- [Releases](https://github.com/vllm-project/vllm/releases)
|
||||
|
||||
vLLM is fast with:
|
||||
|
||||
- State-of-the-art serving throughput
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
# Summary
|
||||
|
||||
[](){ #configuration }
|
||||
|
||||
## Configuration
|
||||
|
||||
API documentation for vLLM's configuration classes.
|
||||
1
docs/cli/.meta.yml
Normal file
1
docs/cli/.meta.yml
Normal file
@ -0,0 +1 @@
|
||||
toc_depth: 3
|
||||
8
docs/cli/.nav.yml
Normal file
8
docs/cli/.nav.yml
Normal file
@ -0,0 +1,8 @@
|
||||
nav:
|
||||
- README.md
|
||||
- serve.md
|
||||
- chat.md
|
||||
- complete.md
|
||||
- run-batch.md
|
||||
- vllm bench:
|
||||
- bench/*.md
|
||||
@ -1,7 +1,3 @@
|
||||
---
|
||||
toc_depth: 4
|
||||
---
|
||||
|
||||
# vLLM CLI Guide
|
||||
|
||||
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
|
||||
@ -18,40 +14,46 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}
|
||||
|
||||
## serve
|
||||
|
||||
Start the vLLM OpenAI Compatible API server.
|
||||
Starts the vLLM OpenAI Compatible API server.
|
||||
|
||||
??? console "Examples"
|
||||
Start with a model:
|
||||
|
||||
```bash
|
||||
# Start with a model
|
||||
vllm serve meta-llama/Llama-2-7b-hf
|
||||
```bash
|
||||
vllm serve meta-llama/Llama-2-7b-hf
|
||||
```
|
||||
|
||||
# Specify the port
|
||||
vllm serve meta-llama/Llama-2-7b-hf --port 8100
|
||||
Specify the port:
|
||||
|
||||
# Serve over a Unix domain socket
|
||||
vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
|
||||
```bash
|
||||
vllm serve meta-llama/Llama-2-7b-hf --port 8100
|
||||
```
|
||||
|
||||
# Check with --help for more options
|
||||
# To list all groups
|
||||
vllm serve --help=listgroup
|
||||
Serve over a Unix domain socket:
|
||||
|
||||
# To view a argument group
|
||||
vllm serve --help=ModelConfig
|
||||
```bash
|
||||
vllm serve meta-llama/Llama-2-7b-hf --uds /tmp/vllm.sock
|
||||
```
|
||||
|
||||
# To view a single argument
|
||||
vllm serve --help=max-num-seqs
|
||||
Check with --help for more options:
|
||||
|
||||
# To search by keyword
|
||||
vllm serve --help=max
|
||||
```bash
|
||||
# To list all groups
|
||||
vllm serve --help=listgroup
|
||||
|
||||
# To view full help with pager (less/more)
|
||||
vllm serve --help=page
|
||||
```
|
||||
# To view a argument group
|
||||
vllm serve --help=ModelConfig
|
||||
|
||||
### Options
|
||||
# To view a single argument
|
||||
vllm serve --help=max-num-seqs
|
||||
|
||||
--8<-- "docs/argparse/serve.md"
|
||||
# To search by keyword
|
||||
vllm serve --help=max
|
||||
|
||||
# To view full help with pager (less/more)
|
||||
vllm serve --help=page
|
||||
```
|
||||
|
||||
See [vllm serve](./serve.md) for the full reference of all available arguments.
|
||||
|
||||
## chat
|
||||
|
||||
@ -68,6 +70,8 @@ vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
|
||||
vllm chat --quick "hi"
|
||||
```
|
||||
|
||||
See [vllm chat](./chat.md) for the full reference of all available arguments.
|
||||
|
||||
## complete
|
||||
|
||||
Generate text completions based on the given prompt via the running API server.
|
||||
@ -83,7 +87,7 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
|
||||
vllm complete --quick "The future of AI is"
|
||||
```
|
||||
|
||||
</details>
|
||||
See [vllm complete](./complete.md) for the full reference of all available arguments.
|
||||
|
||||
## bench
|
||||
|
||||
@ -110,6 +114,8 @@ vllm bench latency \
|
||||
--load-format dummy
|
||||
```
|
||||
|
||||
See [vllm bench latency](./bench/latency.md) for the full reference of all available arguments.
|
||||
|
||||
### serve
|
||||
|
||||
Benchmark the online serving throughput.
|
||||
@ -124,6 +130,8 @@ vllm bench serve \
|
||||
--num-prompts 5
|
||||
```
|
||||
|
||||
See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
|
||||
|
||||
### throughput
|
||||
|
||||
Benchmark offline inference throughput.
|
||||
@ -137,6 +145,8 @@ vllm bench throughput \
|
||||
--load-format dummy
|
||||
```
|
||||
|
||||
See [vllm bench throughput](./bench/throughput.md) for the full reference of all available arguments.
|
||||
|
||||
## collect-env
|
||||
|
||||
Start collecting environment information.
|
||||
@ -149,24 +159,25 @@ vllm collect-env
|
||||
|
||||
Run batch prompts and write results to file.
|
||||
|
||||
<details>
|
||||
<summary>Examples</summary>
|
||||
Running with a local file:
|
||||
|
||||
```bash
|
||||
# Running with a local file
|
||||
vllm run-batch \
|
||||
-i offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
# Using remote file
|
||||
Using remote file:
|
||||
|
||||
```bash
|
||||
vllm run-batch \
|
||||
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
|
||||
-o results.jsonl \
|
||||
--model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
</details>
|
||||
See [vllm run-batch](./run-batch.md) for the full reference of all available arguments.
|
||||
|
||||
## More Help
|
||||
|
||||
|
||||
9
docs/cli/bench/latency.md
Normal file
9
docs/cli/bench/latency.md
Normal file
@ -0,0 +1,9 @@
|
||||
# vllm bench latency
|
||||
|
||||
## JSON CLI Arguments
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
|
||||
--8<-- "docs/argparse/bench_latency.md"
|
||||
9
docs/cli/bench/serve.md
Normal file
9
docs/cli/bench/serve.md
Normal file
@ -0,0 +1,9 @@
|
||||
# vllm bench serve
|
||||
|
||||
## JSON CLI Arguments
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
|
||||
--8<-- "docs/argparse/bench_serve.md"
|
||||
9
docs/cli/bench/throughput.md
Normal file
9
docs/cli/bench/throughput.md
Normal file
@ -0,0 +1,9 @@
|
||||
# vllm bench throughput
|
||||
|
||||
## JSON CLI Arguments
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
|
||||
--8<-- "docs/argparse/bench_throughput.md"
|
||||
5
docs/cli/chat.md
Normal file
5
docs/cli/chat.md
Normal file
@ -0,0 +1,5 @@
|
||||
# vllm chat
|
||||
|
||||
## Options
|
||||
|
||||
--8<-- "docs/argparse/chat.md"
|
||||
5
docs/cli/complete.md
Normal file
5
docs/cli/complete.md
Normal file
@ -0,0 +1,5 @@
|
||||
# vllm complete
|
||||
|
||||
## Options
|
||||
|
||||
--8<-- "docs/argparse/complete.md"
|
||||
9
docs/cli/json_tip.inc.md
Normal file
9
docs/cli/json_tip.inc.md
Normal file
@ -0,0 +1,9 @@
|
||||
When passing JSON CLI arguments, the following sets of arguments are equivalent:
|
||||
|
||||
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`
|
||||
- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
|
||||
|
||||
Additionally, list elements can be passed individually using `+`:
|
||||
|
||||
- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`
|
||||
- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`
|
||||
9
docs/cli/run-batch.md
Normal file
9
docs/cli/run-batch.md
Normal file
@ -0,0 +1,9 @@
|
||||
# vllm run-batch
|
||||
|
||||
## JSON CLI Arguments
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
|
||||
--8<-- "docs/argparse/run-batch.md"
|
||||
9
docs/cli/serve.md
Normal file
9
docs/cli/serve.md
Normal file
@ -0,0 +1,9 @@
|
||||
# vllm serve
|
||||
|
||||
## JSON CLI Arguments
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## Options
|
||||
|
||||
--8<-- "docs/argparse/serve.md"
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||
|
||||
- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152).
|
||||
- [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing)
|
||||
- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||
- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||
|
||||
@ -15,6 +15,7 @@ Cash Donations:
|
||||
|
||||
Compute Resources:
|
||||
|
||||
- Alibaba Cloud
|
||||
- AMD
|
||||
- Anyscale
|
||||
- AWS
|
||||
|
||||
@ -11,6 +11,8 @@ Engine arguments control the behavior of the vLLM engine.
|
||||
|
||||
The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings.
|
||||
|
||||
--8<-- "docs/cli/json_tip.inc.md"
|
||||
|
||||
## `EngineArgs`
|
||||
|
||||
--8<-- "docs/argparse/engine_args.md"
|
||||
|
||||
@ -96,7 +96,7 @@ Although it’s common to do this with GPUs, don't try to fragment 2 or 8 differ
|
||||
|
||||
### Tune your workloads
|
||||
|
||||
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
||||
Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](gh-file:benchmarks/auto_tune/README.md) to optimize your workloads for your use case.
|
||||
|
||||
### Future Topics We'll Cover
|
||||
|
||||
|
||||
@ -131,19 +131,6 @@ MAX_JOBS=16 uv pip install --system \
|
||||
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
|
||||
```
|
||||
|
||||
### Mamba
|
||||
|
||||
```bash
|
||||
uv pip install --system \
|
||||
--no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.5"
|
||||
```
|
||||
|
||||
### causal-conv1d
|
||||
|
||||
```bash
|
||||
uv pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
```
|
||||
|
||||
## Update all the different vLLM platforms
|
||||
|
||||
Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable
|
||||
|
||||
@ -117,7 +117,7 @@ For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `m
|
||||
|
||||
To support a model with interleaving sliding windows, we need to take care of the following details:
|
||||
|
||||
- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model.
|
||||
- Make sure the model's `config.json` contains `layer_types`.
|
||||
- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
|
||||
|
||||
With these two steps, interleave sliding windows should work with the model.
|
||||
|
||||
@ -540,8 +540,10 @@ return a schema of the tensors outputted by the HF processor that are related to
|
||||
The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
|
||||
`(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
|
||||
|
||||
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
|
||||
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
|
||||
In order to support the use of
|
||||
[MultiModalFieldConfig.batched][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
|
||||
like in LLaVA, we remove the extra batch dimension by overriding
|
||||
[BaseMultiModalProcessor._call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]:
|
||||
|
||||
??? code
|
||||
|
||||
@ -816,7 +818,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
|
||||
[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
|
||||
and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
|
||||
decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.processing.MultiModalRegistry.register_processor]
|
||||
decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor]
|
||||
to register them to the multi-modal registry:
|
||||
|
||||
```diff
|
||||
|
||||
7
docs/examples/README.md
Normal file
7
docs/examples/README.md
Normal file
@ -0,0 +1,7 @@
|
||||
# Examples
|
||||
|
||||
vLLM's examples are split into three categories:
|
||||
|
||||
- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/)
|
||||
- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/)
|
||||
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/)
|
||||
@ -351,3 +351,22 @@ vllm serve ibm-granite/granite-speech-3.3-2b \
|
||||
```
|
||||
|
||||
Note: Default multimodal LoRAs are currently only available for `.generate` and chat completions.
|
||||
|
||||
## Using Tips
|
||||
|
||||
### Configuring `max_lora_rank`
|
||||
|
||||
The `--max-lora-rank` parameter controls the maximum rank allowed for LoRA adapters. This setting affects memory allocation and performance:
|
||||
|
||||
- **Set it to the maximum rank** among all LoRA adapters you plan to use
|
||||
- **Avoid setting it too high** - using a value much larger than needed wastes memory and can cause performance issues
|
||||
|
||||
For example, if your LoRA adapters have ranks [16, 32, 64], use `--max-lora-rank 64` rather than 256
|
||||
|
||||
```bash
|
||||
# Good: matches actual maximum rank
|
||||
vllm serve model --enable-lora --max-lora-rank 64
|
||||
|
||||
# Bad: unnecessarily high, wastes memory
|
||||
vllm serve model --enable-lora --max-lora-rank 256
|
||||
```
|
||||
|
||||
@ -203,6 +203,7 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
|
||||
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
|
||||
"draft_tensor_parallel_size": 1,
|
||||
"num_speculative_tokens": 2,
|
||||
"method": "eagle",
|
||||
},
|
||||
)
|
||||
|
||||
@ -231,6 +232,9 @@ A few important things to consider when using the EAGLE based draft models:
|
||||
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
|
||||
investigation and tracked here: <gh-issue:9565>.
|
||||
|
||||
4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
|
||||
That is, to specify `"method": "eagle3"` in `speculative_config`.
|
||||
|
||||
A variety of EAGLE draft models are available on the Hugging Face hub:
|
||||
|
||||
| Base Model | EAGLE on Hugging Face | # EAGLE Parameters |
|
||||
|
||||
@ -14,3 +14,16 @@ vLLM supports the following hardware platforms:
|
||||
- [Google TPU](google_tpu.md)
|
||||
- [Intel Gaudi](intel_gaudi.md)
|
||||
- [AWS Neuron](aws_neuron.md)
|
||||
|
||||
## Hardware Plugins
|
||||
|
||||
The backends below live **outside** the main `vllm` repository and follow the
|
||||
[Hardware-Pluggable RFC](../design/plugin_system.md).
|
||||
|
||||
| Accelerator | PyPI / package | Repository |
|
||||
|-------------|----------------|------------|
|
||||
| Ascend NPU | `vllm-ascend` | <https://github.com/vllm-project/vllm-ascend> |
|
||||
| Intel Gaudi (HPU) | N/A, install from source | <https://github.com/vllm-project/vllm-gaudi> |
|
||||
| MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
|
||||
| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
|
||||
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
|
||||
|
||||
@ -6,7 +6,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
|
||||
# --8<-- [start:requirements]
|
||||
|
||||
- OS: Linux
|
||||
- CPU flags: `avx512f`, `avx512_bf16` (Optional), `avx512_vnni` (Optional)
|
||||
- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional)
|
||||
|
||||
!!! tip
|
||||
Use `lscpu` to check the CPU flags.
|
||||
@ -28,7 +28,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
|
||||
[https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
|
||||
|
||||
!!! warning
|
||||
If deploying the pre-built images on machines only contain `avx512f`, `Illegal instruction` error may be raised. It is recommended to build images for these machines with `--build-arg VLLM_CPU_AVX512BF16=false` and `--build-arg VLLM_CPU_AVX512VNNI=false`.
|
||||
If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. It is recommended to build images for these machines with the appropriate build arguments (e.g., `--build-arg VLLM_CPU_DISABLE_AVX512=true`, `--build-arg VLLM_CPU_AVX512BF16=false`, or `--build-arg VLLM_CPU_AVX512VNNI=false`) to disable unsupported features. Please note that without `avx512f`, AVX2 will be used and this version is not recommended because it only has basic feature support.
|
||||
|
||||
# --8<-- [end:pre-built-images]
|
||||
# --8<-- [start:build-image-from-source]
|
||||
@ -37,6 +37,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
|
||||
docker build -f docker/Dockerfile.cpu \
|
||||
--build-arg VLLM_CPU_AVX512BF16=false (default)|true \
|
||||
--build-arg VLLM_CPU_AVX512VNNI=false (default)|true \
|
||||
--build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \
|
||||
--tag vllm-cpu-env \
|
||||
--target vllm-openai .
|
||||
|
||||
|
||||
@ -15,8 +15,14 @@ sys.modules["aiohttp"] = MagicMock()
|
||||
sys.modules["blake3"] = MagicMock()
|
||||
sys.modules["vllm._C"] = MagicMock()
|
||||
|
||||
from vllm.benchmarks import latency # noqa: E402
|
||||
from vllm.benchmarks import serve # noqa: E402
|
||||
from vllm.benchmarks import throughput # noqa: E402
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
|
||||
from vllm.entrypoints.cli.openai import ChatCommand # noqa: E402
|
||||
from vllm.entrypoints.cli.openai import CompleteCommand # noqa: E402
|
||||
from vllm.entrypoints.openai import cli_args # noqa: E402
|
||||
from vllm.entrypoints.openai import run_batch # noqa: E402
|
||||
from vllm.utils import FlexibleArgumentParser # noqa: E402
|
||||
|
||||
logger = logging.getLogger("mkdocs")
|
||||
@ -68,7 +74,8 @@ class MarkdownFormatter(HelpFormatter):
|
||||
self._markdown_output.append(
|
||||
f"Possible choices: {metavar}\n\n")
|
||||
|
||||
self._markdown_output.append(f"{action.help}\n\n")
|
||||
if action.help:
|
||||
self._markdown_output.append(f"{action.help}\n\n")
|
||||
|
||||
if (default := action.default) != SUPPRESS:
|
||||
self._markdown_output.append(f"Default: `{default}`\n\n")
|
||||
@ -78,7 +85,7 @@ class MarkdownFormatter(HelpFormatter):
|
||||
return "".join(self._markdown_output)
|
||||
|
||||
|
||||
def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
|
||||
def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser:
|
||||
"""Create a parser for the given class with markdown formatting.
|
||||
|
||||
Args:
|
||||
@ -88,18 +95,12 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
|
||||
Returns:
|
||||
FlexibleArgumentParser: A parser with markdown formatting for the class.
|
||||
"""
|
||||
parser = FlexibleArgumentParser()
|
||||
parser = FlexibleArgumentParser(add_json_tip=False)
|
||||
parser.formatter_class = MarkdownFormatter
|
||||
with patch("vllm.config.DeviceConfig.__post_init__"):
|
||||
return cls.add_cli_args(parser, **kwargs)
|
||||
|
||||
|
||||
def create_serve_parser() -> FlexibleArgumentParser:
|
||||
"""Create a parser for the serve command with markdown formatting."""
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.formatter_class = lambda prog: MarkdownFormatter(
|
||||
prog, starting_heading_level=4)
|
||||
return make_arg_parser(parser)
|
||||
_parser = add_cli_args(parser, **kwargs)
|
||||
# add_cli_args might be in-place so return parser if _parser is None
|
||||
return _parser or parser
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
@ -113,10 +114,24 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
|
||||
# Create parsers to document
|
||||
parsers = {
|
||||
"engine_args": create_parser(EngineArgs),
|
||||
"async_engine_args": create_parser(AsyncEngineArgs,
|
||||
async_args_only=True),
|
||||
"serve": create_serve_parser(),
|
||||
"engine_args":
|
||||
create_parser(EngineArgs.add_cli_args),
|
||||
"async_engine_args":
|
||||
create_parser(AsyncEngineArgs.add_cli_args, async_args_only=True),
|
||||
"serve":
|
||||
create_parser(cli_args.make_arg_parser),
|
||||
"chat":
|
||||
create_parser(ChatCommand.add_cli_args),
|
||||
"complete":
|
||||
create_parser(CompleteCommand.add_cli_args),
|
||||
"bench_latency":
|
||||
create_parser(latency.add_cli_args),
|
||||
"bench_throughput":
|
||||
create_parser(throughput.add_cli_args),
|
||||
"bench_serve":
|
||||
create_parser(serve.add_cli_args),
|
||||
"run-batch":
|
||||
create_parser(run_batch.make_arg_parser),
|
||||
}
|
||||
|
||||
# Generate documentation for each parser
|
||||
|
||||
@ -23,6 +23,13 @@ a:not(:has(svg)):not(.md-icon):not(.autorefs-external) {
|
||||
}
|
||||
}
|
||||
|
||||
a[href*="localhost"]::after,
|
||||
a[href*="127.0.0.1"]::after,
|
||||
a[href*="org.readthedocs.build"]::after,
|
||||
a[href*="docs.vllm.ai"]::after {
|
||||
display: none !important;
|
||||
}
|
||||
|
||||
/* Light mode: darker section titles */
|
||||
body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis {
|
||||
color: rgba(0, 0, 0, 0.7) !important;
|
||||
|
||||
@ -4,7 +4,7 @@ vLLM provides first-class support for generative models, which covers most of LL
|
||||
|
||||
In vLLM, generative models implement the[VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
|
||||
Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
|
||||
which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text.
|
||||
which are then passed through [Sampler][vllm.model_executor.layers.sampler.Sampler] to obtain the final text.
|
||||
|
||||
## Configuration
|
||||
|
||||
@ -19,7 +19,7 @@ Run a model in generation mode via the option `--runner generate`.
|
||||
## Offline Inference
|
||||
|
||||
The [LLM][vllm.LLM] class provides various methods for offline inference.
|
||||
See [configuration][configuration] for a list of options when initializing the model.
|
||||
See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
|
||||
|
||||
### `LLM.generate`
|
||||
|
||||
|
||||
@ -81,7 +81,7 @@ which takes priority over both the model's and Sentence Transformers's defaults.
|
||||
## Offline Inference
|
||||
|
||||
The [LLM][vllm.LLM] class provides various methods for offline inference.
|
||||
See [configuration][configuration] for a list of options when initializing the model.
|
||||
See [configuration](../api/summary.md#configuration) for a list of options when initializing the model.
|
||||
|
||||
### `LLM.embed`
|
||||
|
||||
|
||||
@ -331,7 +331,7 @@ th {
|
||||
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
|
||||
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
|
||||
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
|
||||
@ -349,9 +349,10 @@ th {
|
||||
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `Gemma3nForCausalLM` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5 | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
|
||||
@ -404,16 +405,19 @@ th {
|
||||
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | |
|
||||
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
|
||||
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
|
||||
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
|
||||
|
||||
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ |
|
||||
|
||||
!!! note
|
||||
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
|
||||
|
||||
!!! note
|
||||
Only text inputs are currently supported for `Gemma3nForConditionalGeneration`. To use this model, please upgrade Hugging Face Transformers to version 4.53.0.
|
||||
|
||||
### Pooling Models
|
||||
|
||||
See [this page](./pooling_models.md) for more information on how to use pooling models.
|
||||
@ -583,6 +587,9 @@ See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inp
|
||||
|
||||
**This is no longer required if you are using vLLM V1.**
|
||||
|
||||
!!! tip
|
||||
For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache.
|
||||
|
||||
!!! note
|
||||
vLLM currently only supports adding LoRA to the language backbone of multimodal models.
|
||||
|
||||
@ -600,14 +607,15 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
|
||||
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
|
||||
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
|
||||
| `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
|
||||
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4MoeForCausalLM` | GLM-4.5 | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
|
||||
@ -674,6 +682,15 @@ Some models are supported only via the [Transformers backend](#transformers). Th
|
||||
|
||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||
|
||||
!!! note
|
||||
`Gemma3nForConditionalGeneration` is only supported on V1 due to shared KV caching and it depends on `timm>=1.0.17` to make use of its
|
||||
MobileNet-v5 vision backbone.
|
||||
|
||||
Performance is not yet fully optimized mainly due to:
|
||||
|
||||
- Both audio and vision MM encoders use `transformers.AutoModel` implementation.
|
||||
- There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
|
||||
|
||||
!!! note
|
||||
Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
|
||||
|
||||
@ -760,7 +777,7 @@ The following table lists those that are tested in vLLM.
|
||||
Cross-encoder and reranker models are a subset of classification models that accept two prompts as input.
|
||||
These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API.
|
||||
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][parallelism-scaling] | [V1](gh-issue:8779) |
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
|
||||
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ |
|
||||
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# Using vLLM
|
||||
|
||||
vLLM supports the following usage patterns:
|
||||
First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
|
||||
|
||||
Then, vLLM supports the following usage patterns:
|
||||
|
||||
- [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model.
|
||||
- [Deployment](../deployment/docker.md): Scale up model instances for production.
|
||||
|
||||
@ -59,12 +59,13 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
|
||||
|
||||
### Hardware
|
||||
|
||||
| Hardware | Status |
|
||||
|------------|------------------------------------|
|
||||
| **NVIDIA** | <nobr>🚀</nobr> |
|
||||
| **AMD** | <nobr>🟢</nobr> |
|
||||
| **TPU** | <nobr>🟢</nobr> |
|
||||
| **CPU** | <nobr>🟢 (x86) 🟡 (MacOS) </nobr> |
|
||||
| Hardware | Status |
|
||||
|------------|-----------------------------------------------|
|
||||
| **NVIDIA** | <nobr>🚀</nobr> |
|
||||
| **AMD** | <nobr>🟢</nobr> |
|
||||
| **INTEL GPU** | <nobr>🟢</nobr> |
|
||||
| **TPU** | <nobr>🟢</nobr> |
|
||||
| **CPU** | <nobr>🟢 (x86\_64/aarch64) 🟡 (MacOS) </nobr> |
|
||||
|
||||
!!! note
|
||||
|
||||
@ -72,6 +73,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
|
||||
|
||||
- [vllm-ascend](https://github.com/vllm-project/vllm-ascend)
|
||||
- [vllm-spyre](https://github.com/vllm-project/vllm-spyre)
|
||||
- [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi)
|
||||
- [vllm-openvino](https://github.com/vllm-project/vllm-openvino)
|
||||
|
||||
Please check their corresponding repositories for more details.
|
||||
@ -111,6 +113,10 @@ Models that combine Mamba-2 and Mamba-1 layers with standard attention layers ar
|
||||
`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that
|
||||
these models currently require disabling prefix caching and using the FlashInfer attention backend in V1.
|
||||
|
||||
Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
|
||||
Please note that these models currently require disabling prefix caching, enforcing eager mode, and using the FlashInfer
|
||||
attention backend in V1.
|
||||
|
||||
#### Encoder-Decoder Models
|
||||
|
||||
Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`)
|
||||
|
||||
@ -96,6 +96,25 @@ def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Gemma3N
|
||||
def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "google/gemma-3n-E2B-it"
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_batched_tokens=2048,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
enforce_eager=True,
|
||||
)
|
||||
prompt = f"<start_of_turn>user\n<audio_soft_token>{question}"
|
||||
"<end_of_turn>\n<start_of_turn>model\n"
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
# Granite Speech
|
||||
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
|
||||
# NOTE - the setting in this example are somehat different than what is
|
||||
@ -331,6 +350,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
model_example_map = {
|
||||
"voxtral": run_voxtral,
|
||||
"gemma3n": run_gemma3n,
|
||||
"granite_speech": run_granite_speech,
|
||||
"minicpmo": run_minicpmo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
|
||||
@ -126,6 +126,29 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "CohereLabs/command-a-vision-07-2025"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=32768,
|
||||
tensor_parallel_size=4,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
prompts = [
|
||||
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Deepseek-VL2
|
||||
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -211,7 +234,33 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Gemma3N
|
||||
def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "google/gemma-3n-E2B-it"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
enforce_eager=True,
|
||||
)
|
||||
|
||||
prompts = [
|
||||
(
|
||||
"<start_of_turn>user\n"
|
||||
f"<image_soft_token>{question}<end_of_turn>\n"
|
||||
"<start_of_turn>model\n"
|
||||
)
|
||||
for question in questions
|
||||
]
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
@ -1391,10 +1440,12 @@ model_example_map = {
|
||||
"aya_vision": run_aya_vision,
|
||||
"blip-2": run_blip2,
|
||||
"chameleon": run_chameleon,
|
||||
"command_a_vision": run_command_a_vision,
|
||||
"deepseek_vl_v2": run_deepseek_vl2,
|
||||
"florence2": run_florence2,
|
||||
"fuyu": run_fuyu,
|
||||
"gemma3": run_gemma3,
|
||||
"gemma3n": run_gemma3n,
|
||||
"glm4v": run_glm4v,
|
||||
"glm4_1v": run_glm4_1v,
|
||||
"h2ovl_chat": run_h2ovl,
|
||||
|
||||
@ -107,6 +107,42 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "CohereLabs/command-a-vision-07-2025"
|
||||
|
||||
# NOTE: This model is 122B parameters and requires tensor parallelism
|
||||
# Recommended to use tp=4 on H100 GPUs
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=32768,
|
||||
tensor_parallel_size=4,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{"type": "text", "text": question},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||
|
||||
@ -1031,6 +1067,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_example_map = {
|
||||
"aria": load_aria,
|
||||
"aya_vision": load_aya_vision,
|
||||
"command_a_vision": load_command_a_vision,
|
||||
"deepseek_vl_v2": load_deepseek_vl2,
|
||||
"gemma3": load_gemma3,
|
||||
"h2ovl_chat": load_h2ovl,
|
||||
|
||||
186
examples/online_serving/openai_embedding_long_text/README.md
Normal file
186
examples/online_serving/openai_embedding_long_text/README.md
Normal file
@ -0,0 +1,186 @@
|
||||
# Long Text Embedding with Chunked Processing
|
||||
|
||||
This directory contains examples for using vLLM's **chunked processing** feature to handle long text embedding that exceeds the model's maximum context length.
|
||||
|
||||
## 🚀 Quick Start
|
||||
|
||||
### Start the Server
|
||||
|
||||
Use the provided script to start a vLLM server with chunked processing enabled:
|
||||
|
||||
```bash
|
||||
# Basic usage (supports very long texts up to ~3M tokens)
|
||||
./service.sh
|
||||
|
||||
# Custom configuration with different models
|
||||
MODEL_NAME="jinaai/jina-embeddings-v3" \
|
||||
MAX_EMBED_LEN=1048576 \
|
||||
./service.sh
|
||||
|
||||
# For extremely long documents
|
||||
MODEL_NAME="intfloat/multilingual-e5-large" \
|
||||
MAX_EMBED_LEN=3072000 \
|
||||
./service.sh
|
||||
```
|
||||
|
||||
### Test Long Text Embedding
|
||||
|
||||
Run the comprehensive test client:
|
||||
|
||||
```bash
|
||||
python client.py
|
||||
```
|
||||
|
||||
## 📁 Files
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `service.sh` | Server startup script with chunked processing enabled |
|
||||
| `client.py` | Comprehensive test client for long text embedding |
|
||||
|
||||
## ⚙️ Configuration
|
||||
|
||||
### Server Configuration
|
||||
|
||||
The key parameters for chunked processing are in the `--override-pooler-config`:
|
||||
|
||||
```json
|
||||
{
|
||||
"pooling_type": "auto",
|
||||
"normalize": true,
|
||||
"enable_chunked_processing": true,
|
||||
"max_embed_len": 3072000
|
||||
}
|
||||
```
|
||||
|
||||
!!! note
|
||||
`pooling_type` sets the model's own pooling strategy for processing within each chunk. The cross-chunk aggregation automatically uses MEAN strategy when input exceeds the model's native maximum length.
|
||||
|
||||
#### Chunked Processing Behavior
|
||||
|
||||
Chunked processing uses **MEAN aggregation** for cross-chunk combination when input exceeds the model's native maximum length:
|
||||
|
||||
| Component | Behavior | Description |
|
||||
|-----------|----------|-------------|
|
||||
| **Within chunks** | Model's native pooling | Uses the model's configured pooling strategy |
|
||||
| **Cross-chunk aggregation** | Always MEAN | Weighted averaging based on chunk token counts |
|
||||
| **Performance** | Optimal | All chunks processed for complete semantic coverage |
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `MODEL_NAME` | `intfloat/multilingual-e5-large` | Embedding model to use (supports multiple models) |
|
||||
| `PORT` | `31090` | Server port |
|
||||
| `GPU_COUNT` | `1` | Number of GPUs to use |
|
||||
| `MAX_EMBED_LEN` | `3072000` | Maximum embedding input length (supports very long documents) |
|
||||
| `POOLING_TYPE` | `auto` | Model's native pooling type: `auto`, `MEAN`, `CLS`, `LAST` (only affects within-chunk pooling, not cross-chunk aggregation) |
|
||||
| `API_KEY` | `EMPTY` | API key for authentication |
|
||||
|
||||
## 🔧 How It Works
|
||||
|
||||
1. **Enhanced Input Validation**: `max_embed_len` allows accepting inputs longer than `max_model_len` without environment variables
|
||||
2. **Smart Chunking**: Text is split based on `max_position_embeddings` to maintain semantic integrity
|
||||
3. **Unified Processing**: All chunks processed separately through the model using its configured pooling strategy
|
||||
4. **MEAN Aggregation**: When input exceeds model's native length, results combined using token count-based weighted averaging across all chunks
|
||||
5. **Consistent Output**: Final embeddings maintain the same dimensionality as standard processing
|
||||
|
||||
### Input Length Handling
|
||||
|
||||
- **Within max_embed_len**: Input is accepted and processed (up to 3M+ tokens)
|
||||
- **Exceeds max_position_embeddings**: Chunked processing is automatically triggered
|
||||
- **Exceeds max_embed_len**: Input is rejected with clear error message
|
||||
- **No environment variables required**: Works without `VLLM_ALLOW_LONG_MAX_MODEL_LEN`
|
||||
|
||||
### Extreme Long Text Support
|
||||
|
||||
With `MAX_EMBED_LEN=3072000`, you can process:
|
||||
|
||||
- **Academic papers**: Full research papers with references
|
||||
- **Legal documents**: Complete contracts and legal texts
|
||||
- **Books**: Entire chapters or small books
|
||||
- **Code repositories**: Large codebases and documentation
|
||||
|
||||
## 📊 Performance Characteristics
|
||||
|
||||
### Chunked Processing Performance
|
||||
|
||||
| Aspect | Behavior | Performance |
|
||||
|--------|----------|-------------|
|
||||
| **Chunk Processing** | All chunks processed with native pooling | Consistent with input length |
|
||||
| **Cross-chunk Aggregation** | MEAN weighted averaging | Minimal overhead |
|
||||
| **Memory Usage** | Proportional to number of chunks | Moderate, scalable |
|
||||
| **Semantic Quality** | Complete text coverage | Optimal for long documents |
|
||||
|
||||
## 🧪 Test Cases
|
||||
|
||||
The test client demonstrates:
|
||||
|
||||
- ✅ **Short text**: Normal processing (baseline)
|
||||
- ✅ **Medium text**: Single chunk processing
|
||||
- ✅ **Long text**: Multi-chunk processing with aggregation
|
||||
- ✅ **Very long text**: Many chunks processing
|
||||
- ✅ **Extreme long text**: Document-level processing (100K+ tokens)
|
||||
- ✅ **Batch processing**: Mixed-length inputs in one request
|
||||
- ✅ **Consistency**: Reproducible results across runs
|
||||
|
||||
## 🐛 Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Chunked processing not enabled**:
|
||||
|
||||
```log
|
||||
ValueError: This model's maximum position embeddings length is 4096 tokens...
|
||||
```
|
||||
|
||||
**Solution**: Ensure `enable_chunked_processing: true` in pooler config
|
||||
|
||||
2. **Input exceeds max_embed_len**:
|
||||
|
||||
```log
|
||||
ValueError: This model's maximum embedding input length is 3072000 tokens...
|
||||
```
|
||||
|
||||
**Solution**: Increase `max_embed_len` in pooler config or reduce input length
|
||||
|
||||
3. **Memory errors**:
|
||||
|
||||
```log
|
||||
RuntimeError: CUDA out of memory
|
||||
```
|
||||
|
||||
**Solution**: Reduce chunk size by adjusting model's `max_position_embeddings` or use fewer GPUs
|
||||
|
||||
4. **Slow processing**:
|
||||
**Expected**: Long text takes more time due to multiple inference calls
|
||||
|
||||
### Debug Information
|
||||
|
||||
Server logs show chunked processing activity:
|
||||
|
||||
```log
|
||||
INFO: Input length 150000 exceeds max_position_embeddings 4096, will use chunked processing
|
||||
INFO: Split input of 150000 tokens into 37 chunks (max_chunk_size: 4096)
|
||||
```
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
To extend chunked processing support to other embedding models:
|
||||
|
||||
1. Check model compatibility with the pooling architecture
|
||||
2. Test with various text lengths
|
||||
3. Validate embedding quality compared to single-chunk processing
|
||||
4. Submit PR with test cases and documentation updates
|
||||
|
||||
## 🆕 Enhanced Features
|
||||
|
||||
### max_embed_len Parameter
|
||||
|
||||
The new `max_embed_len` parameter provides:
|
||||
|
||||
- **Simplified Configuration**: No need for `VLLM_ALLOW_LONG_MAX_MODEL_LEN` environment variable
|
||||
- **Flexible Input Validation**: Accept inputs longer than `max_model_len` up to `max_embed_len`
|
||||
- **Extreme Length Support**: Process documents with millions of tokens
|
||||
- **Clear Error Messages**: Better feedback when inputs exceed limits
|
||||
- **Backward Compatibility**: Existing configurations continue to work
|
||||
366
examples/online_serving/openai_embedding_long_text/client.py
Normal file
366
examples/online_serving/openai_embedding_long_text/client.py
Normal file
@ -0,0 +1,366 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Example script demonstrating long text embedding with chunked processing in vLLM.
|
||||
|
||||
This example shows how to use vLLM's chunked processing feature to handle text
|
||||
inputs that exceed the model's maximum token length. The feature automatically
|
||||
splits long text into chunks and handles different pooling types optimally.
|
||||
|
||||
Prerequisites:
|
||||
1. Start vLLM server with chunked processing enabled:
|
||||
|
||||
# MEAN pooling (processes all chunks, recommended for complete coverage)
|
||||
vllm serve intfloat/multilingual-e5-large \
|
||||
--override-pooler-config \
|
||||
'{"pooling_type": "MEAN", "normalize": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \
|
||||
--served-model-name multilingual-e5-large \
|
||||
--trust-remote-code \
|
||||
--port 31090 \
|
||||
--api-key your-api-key
|
||||
|
||||
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
|
||||
vllm serve BAAI/bge-large-en-v1.5 \
|
||||
--override-pooler-config \
|
||||
'{"pooling_type": "CLS", "normalize": true, ' \
|
||||
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \
|
||||
--served-model-name bge-large-en-v1.5 \
|
||||
--trust-remote-code \
|
||||
--port 31090 \
|
||||
--api-key your-api-key
|
||||
|
||||
2. Install required dependencies:
|
||||
pip install openai requests
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
from openai import OpenAI
|
||||
|
||||
# Configuration
|
||||
API_KEY = "your-api-key" # Replace with your actual API key
|
||||
BASE_URL = "http://localhost:31090/v1"
|
||||
MODEL_NAME = "multilingual-e5-large"
|
||||
|
||||
|
||||
def generate_long_text(base_text: str, repeat_count: int) -> str:
|
||||
"""Generate long text by repeating base text."""
|
||||
return base_text * repeat_count
|
||||
|
||||
|
||||
def test_embedding_with_different_lengths():
|
||||
"""Test embedding generation with different text lengths."""
|
||||
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
|
||||
|
||||
# Test cases with different text lengths
|
||||
test_cases = [
|
||||
{
|
||||
"name": "Short Text",
|
||||
"text": "Hello, this is a short text for embedding.",
|
||||
"expected_chunks": 1,
|
||||
},
|
||||
{
|
||||
"name": "Medium Text",
|
||||
"text": generate_long_text(
|
||||
"This is a medium-length text that should fit within the "
|
||||
"model's context window. " * 20,
|
||||
2,
|
||||
),
|
||||
"expected_chunks": 1,
|
||||
},
|
||||
{
|
||||
"name": "Long Text (2 chunks)",
|
||||
"text": generate_long_text(
|
||||
"This is a very long text that will exceed the model's "
|
||||
"maximum context length and trigger chunked processing. " * 50,
|
||||
5,
|
||||
),
|
||||
"expected_chunks": 2,
|
||||
},
|
||||
{
|
||||
"name": "Very Long Text (3+ chunks)",
|
||||
"text": generate_long_text(
|
||||
"This text is extremely long and will definitely "
|
||||
"require multiple chunks for processing. " * 100,
|
||||
10,
|
||||
),
|
||||
"expected_chunks": 3,
|
||||
},
|
||||
]
|
||||
|
||||
print("🧪 Testing vLLM Long Text Embedding with Chunked Processing")
|
||||
print("=" * 70)
|
||||
|
||||
for i, test_case in enumerate(test_cases, 1):
|
||||
print(f"\n📝 Test {i}: {test_case['name']}")
|
||||
print(f"Text length: {len(test_case['text'])} characters")
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
|
||||
response = client.embeddings.create(
|
||||
input=test_case["text"], model=MODEL_NAME, encoding_format="float"
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
processing_time = end_time - start_time
|
||||
|
||||
# Extract embedding data
|
||||
embedding = response.data[0].embedding
|
||||
embedding_dim = len(embedding)
|
||||
|
||||
print("✅ Success!")
|
||||
print(f" - Embedding dimension: {embedding_dim}")
|
||||
print(f" - Processing time: {processing_time:.2f}s")
|
||||
print(f" - Expected chunks: ~{test_case['expected_chunks']}")
|
||||
print(f" - First 5 values: {embedding[:5]}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Failed: {str(e)}")
|
||||
|
||||
|
||||
def test_batch_embedding():
|
||||
"""Test batch embedding with mixed-length inputs."""
|
||||
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
|
||||
|
||||
print("\n🔄 Testing Batch Embedding with Mixed Lengths")
|
||||
print("=" * 50)
|
||||
|
||||
# Mix of short and long texts
|
||||
batch_inputs = [
|
||||
"Short text 1",
|
||||
generate_long_text("Medium length text that fits in one chunk. " * 20, 1),
|
||||
"Another short text",
|
||||
generate_long_text("Long text requiring chunked processing. " * 100, 5),
|
||||
]
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
|
||||
response = client.embeddings.create(
|
||||
input=batch_inputs, model=MODEL_NAME, encoding_format="float"
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
processing_time = end_time - start_time
|
||||
|
||||
print("✅ Batch processing successful!")
|
||||
print(f" - Number of inputs: {len(batch_inputs)}")
|
||||
print(f" - Number of embeddings: {len(response.data)}")
|
||||
print(f" - Total processing time: {processing_time:.2f}s")
|
||||
print(
|
||||
f" - Average time per input: {processing_time / len(batch_inputs):.2f}s"
|
||||
)
|
||||
|
||||
for i, data in enumerate(response.data):
|
||||
input_length = len(batch_inputs[i])
|
||||
embedding_dim = len(data.embedding)
|
||||
print(
|
||||
f" - Input {i + 1}: {input_length} chars → {embedding_dim}D embedding"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Batch processing failed: {str(e)}")
|
||||
|
||||
|
||||
def test_multiple_long_texts_batch():
|
||||
"""Test batch processing with multiple long texts to verify chunk ID uniqueness."""
|
||||
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
|
||||
|
||||
print("\n🔧 Testing Multiple Long Texts in Batch (Chunk ID Fix Verification)")
|
||||
print("=" * 70)
|
||||
|
||||
# Create multiple distinct long texts that will all require chunking
|
||||
# Note: All pooling types now use MEAN aggregation across chunks:
|
||||
# - Native pooling (MEAN/CLS/LAST) is used within each chunk
|
||||
# - MEAN aggregation combines results across all chunks
|
||||
# - Full semantic coverage for all pooling types
|
||||
long_texts = [
|
||||
generate_long_text(
|
||||
"First long document about artificial intelligence and machine learning. "
|
||||
* 80,
|
||||
6,
|
||||
),
|
||||
generate_long_text(
|
||||
"Second long document about natural language processing and transformers. "
|
||||
* 80,
|
||||
6,
|
||||
),
|
||||
generate_long_text(
|
||||
"Third long document about computer vision and neural networks. " * 80, 6
|
||||
),
|
||||
]
|
||||
|
||||
# Add some short texts to mix things up
|
||||
batch_inputs = [
|
||||
"Short text before long texts",
|
||||
long_texts[0],
|
||||
"Short text between long texts",
|
||||
long_texts[1],
|
||||
long_texts[2],
|
||||
"Short text after long texts",
|
||||
]
|
||||
|
||||
print("📊 Batch composition:")
|
||||
for i, text in enumerate(batch_inputs):
|
||||
length = len(text)
|
||||
text_type = "Long (will be chunked)" if length > 5000 else "Short"
|
||||
print(f" - Input {i + 1}: {length} chars ({text_type})")
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
|
||||
response = client.embeddings.create(
|
||||
input=batch_inputs, model=MODEL_NAME, encoding_format="float"
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
processing_time = end_time - start_time
|
||||
|
||||
print("\n✅ Multiple long texts batch processing successful!")
|
||||
print(f" - Number of inputs: {len(batch_inputs)}")
|
||||
print(f" - Number of embeddings returned: {len(response.data)}")
|
||||
print(f" - Total processing time: {processing_time:.2f}s")
|
||||
|
||||
# Verify each embedding is different (no incorrect aggregation)
|
||||
embeddings = [data.embedding for data in response.data]
|
||||
|
||||
if len(embeddings) >= 3:
|
||||
import numpy as np
|
||||
|
||||
# Compare embeddings of the long texts (indices 1, 3, 4)
|
||||
long_embeddings = [
|
||||
np.array(embeddings[1]), # First long text
|
||||
np.array(embeddings[3]), # Second long text
|
||||
np.array(embeddings[4]), # Third long text
|
||||
]
|
||||
|
||||
print("\n🔍 Verifying embedding uniqueness:")
|
||||
for i in range(len(long_embeddings)):
|
||||
for j in range(i + 1, len(long_embeddings)):
|
||||
cosine_sim = np.dot(long_embeddings[i], long_embeddings[j]) / (
|
||||
np.linalg.norm(long_embeddings[i])
|
||||
* np.linalg.norm(long_embeddings[j])
|
||||
)
|
||||
print(
|
||||
f" - Similarity between long text {i + 1} and {j + 1}: "
|
||||
f"{cosine_sim:.4f}"
|
||||
)
|
||||
|
||||
if (
|
||||
cosine_sim < 0.9
|
||||
): # Different content should have lower similarity
|
||||
print(" ✅ Good: Embeddings are appropriately different")
|
||||
else:
|
||||
print(
|
||||
" ⚠️ High similarity - may indicate chunk "
|
||||
"aggregation issue"
|
||||
)
|
||||
|
||||
print("\n📋 Per-input results:")
|
||||
for i, data in enumerate(response.data):
|
||||
input_length = len(batch_inputs[i])
|
||||
embedding_dim = len(data.embedding)
|
||||
embedding_norm = np.linalg.norm(data.embedding)
|
||||
print(
|
||||
f" - Input {i + 1}: {input_length} chars → {embedding_dim}D "
|
||||
f"embedding (norm: {embedding_norm:.4f})"
|
||||
)
|
||||
|
||||
print(
|
||||
"\n✅ This test verifies the fix for chunk ID collisions in "
|
||||
"batch processing"
|
||||
)
|
||||
print(" - Before fix: Multiple long texts would have conflicting chunk IDs")
|
||||
print(" - After fix: Each prompt's chunks have unique IDs with prompt index")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Multiple long texts batch test failed: {str(e)}")
|
||||
print(" This might indicate the chunk ID collision bug is present!")
|
||||
|
||||
|
||||
def test_embedding_consistency():
|
||||
"""Test that chunked processing produces consistent results."""
|
||||
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
|
||||
|
||||
print("\n🔍 Testing Embedding Consistency")
|
||||
print("=" * 40)
|
||||
|
||||
# Use the same long text multiple times
|
||||
long_text = generate_long_text(
|
||||
"Consistency test text for chunked processing validation. " * 50, 3
|
||||
)
|
||||
|
||||
embeddings = []
|
||||
|
||||
try:
|
||||
for i in range(3):
|
||||
response = client.embeddings.create(
|
||||
input=long_text, model=MODEL_NAME, encoding_format="float"
|
||||
)
|
||||
embeddings.append(response.data[0].embedding)
|
||||
print(f" - Generated embedding {i + 1}")
|
||||
|
||||
# Check consistency (embeddings should be identical)
|
||||
if len(embeddings) >= 2:
|
||||
# Calculate similarity between first two embeddings
|
||||
|
||||
emb1 = np.array(embeddings[0])
|
||||
emb2 = np.array(embeddings[1])
|
||||
|
||||
# Cosine similarity
|
||||
cosine_sim = np.dot(emb1, emb2) / (
|
||||
np.linalg.norm(emb1) * np.linalg.norm(emb2)
|
||||
)
|
||||
|
||||
print("✅ Consistency test completed!")
|
||||
print(f" - Cosine similarity between runs: {cosine_sim:.6f}")
|
||||
print(" - Expected: ~1.0 (identical embeddings)")
|
||||
|
||||
if cosine_sim > 0.999:
|
||||
print(" - ✅ High consistency achieved!")
|
||||
else:
|
||||
print(" - ⚠️ Consistency may vary due to numerical precision")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Consistency test failed: {str(e)}")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function to run all tests."""
|
||||
print("🚀 vLLM Long Text Embedding Client")
|
||||
print(f"📡 Connecting to: {BASE_URL}")
|
||||
print(f"🤖 Model: {MODEL_NAME}")
|
||||
masked_key = "*" * (len(API_KEY) - 4) + API_KEY[-4:] if len(API_KEY) > 4 else "****"
|
||||
print(f"🔑 API Key: {masked_key}")
|
||||
|
||||
# Run all test cases
|
||||
test_embedding_with_different_lengths()
|
||||
test_batch_embedding()
|
||||
test_multiple_long_texts_batch()
|
||||
test_embedding_consistency()
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("🎉 All tests completed!")
|
||||
print("\n💡 Key Features Demonstrated:")
|
||||
print(" - ✅ Automatic chunked processing for long text")
|
||||
print(" - ✅ Seamless handling of mixed-length batches")
|
||||
print(" - ✅ Multiple long texts in single batch (chunk ID fix)")
|
||||
print(" - ✅ Unified chunked processing:")
|
||||
print(" • Native pooling used within each chunk")
|
||||
print(" • MEAN aggregation across all chunks")
|
||||
print(" • Complete semantic coverage for all pooling types")
|
||||
print(" - ✅ Consistent embedding generation")
|
||||
print(" - ✅ Backward compatibility with short text")
|
||||
print("\n📚 For more information, see:")
|
||||
print(
|
||||
" - Documentation: https://docs.vllm.ai/en/latest/models/pooling_models.html"
|
||||
)
|
||||
print(" - Chunked Processing Guide: openai_embedding_long_text.md")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
137
examples/online_serving/openai_embedding_long_text/service.sh
Normal file
137
examples/online_serving/openai_embedding_long_text/service.sh
Normal file
@ -0,0 +1,137 @@
|
||||
#!/bin/bash
|
||||
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# vLLM Embedding Server with Enhanced Chunked Processing
|
||||
# This script starts a vLLM server with chunked processing enabled for long text embedding.
|
||||
# Now supports proper pooling type validation and model-specific configurations.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Configuration
|
||||
MODEL_NAME=${MODEL_NAME:-"intfloat/multilingual-e5-large"}
|
||||
MODEL_CODE=${MODEL_CODE:-"multilingual-e5-large"}
|
||||
|
||||
PORT=${PORT:-31090}
|
||||
GPU_COUNT=${GPU_COUNT:-1}
|
||||
MAX_EMBED_LEN=${MAX_EMBED_LEN:-3072000}
|
||||
API_KEY=${API_KEY:-"your-api-key"}
|
||||
|
||||
# Enhanced pooling configuration with model-specific defaults
|
||||
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST
|
||||
export VLLM_ENABLE_CHUNKED_PROCESSING=true
|
||||
export CUDA_VISIBLE_DEVICES=2,3,4,5
|
||||
# export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||
|
||||
echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
|
||||
echo "=================================================================="
|
||||
|
||||
# Environment variables for optimization
|
||||
export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
|
||||
# Function to determine optimal pooling type for known models
|
||||
get_optimal_pooling_type() {
|
||||
local model="$1"
|
||||
case "$model" in
|
||||
*"e5-"* | *"multilingual-e5"*)
|
||||
echo "MEAN" # E5 series native pooling
|
||||
;;
|
||||
*"bge-"*)
|
||||
echo "CLS" # BGE series native pooling
|
||||
;;
|
||||
*"gte-"*)
|
||||
echo "LAST" # GTE series native pooling
|
||||
;;
|
||||
*"sentence-t5"* | *"st5"*)
|
||||
echo "MEAN" # Sentence-T5 native pooling
|
||||
;;
|
||||
*"jina-embeddings"*)
|
||||
echo "MEAN" # Jina embeddings native pooling
|
||||
;;
|
||||
*"Qwen"*"Embedding"*)
|
||||
echo "LAST" # Qwen embeddings native pooling
|
||||
;;
|
||||
*)
|
||||
echo "MEAN" # Default native pooling for unknown models
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
# Auto-detect pooling type if not explicitly set
|
||||
if [ "$POOLING_TYPE" = "auto" ]; then
|
||||
POOLING_TYPE=$(get_optimal_pooling_type "$MODEL_NAME")
|
||||
echo "🔍 Auto-detected pooling type: $POOLING_TYPE for model $MODEL_NAME"
|
||||
fi
|
||||
|
||||
# Display configuration
|
||||
echo "📋 Configuration:"
|
||||
echo " - Model: $MODEL_NAME"
|
||||
echo " - Port: $PORT"
|
||||
echo " - GPU Count: $GPU_COUNT"
|
||||
echo " - Enhanced Chunked Processing: ${VLLM_ENABLE_CHUNKED_PROCESSING}"
|
||||
echo " - Max Embed Length: ${MAX_EMBED_LEN} tokens"
|
||||
echo " - Native Pooling Type: $POOLING_TYPE + Normalization"
|
||||
echo " - Cross-chunk Aggregation: MEAN (automatic)"
|
||||
echo ""
|
||||
|
||||
# Validate GPU availability
|
||||
if command -v nvidia-smi &> /dev/null; then
|
||||
gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
echo "🖥️ Available GPUs: $gpu_count"
|
||||
if [ "$GPU_COUNT" -gt "$gpu_count" ]; then
|
||||
echo "⚠️ Warning: Requested $GPU_COUNT GPUs but only $gpu_count available"
|
||||
echo " Adjusting to use $gpu_count GPUs"
|
||||
GPU_COUNT=$gpu_count
|
||||
fi
|
||||
else
|
||||
echo "⚠️ Warning: nvidia-smi not found. GPU detection skipped."
|
||||
fi
|
||||
|
||||
# Chunked processing uses unified MEAN aggregation
|
||||
echo "ℹ️ Chunked Processing: Using $POOLING_TYPE pooling within chunks, MEAN aggregation across chunks"
|
||||
echo " - All chunks processed for complete semantic coverage"
|
||||
echo " - Weighted averaging based on chunk token counts"
|
||||
|
||||
echo ""
|
||||
echo "🔧 Starting server with enhanced chunked processing configuration..."
|
||||
|
||||
# Build pooler config JSON
|
||||
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
|
||||
|
||||
# Start vLLM server with enhanced chunked processing
|
||||
vllm serve "$MODEL_NAME" \
|
||||
--tensor-parallel-size "$GPU_COUNT" \
|
||||
--enforce-eager \
|
||||
--override-pooler-config "$POOLER_CONFIG" \
|
||||
--served-model-name ${MODEL_CODE} \
|
||||
--api-key "$API_KEY" \
|
||||
--trust-remote-code \
|
||||
--port "$PORT" \
|
||||
--host 0.0.0.0
|
||||
|
||||
echo ""
|
||||
echo "✅ vLLM Embedding Server started successfully!"
|
||||
echo ""
|
||||
echo "📡 Server Information:"
|
||||
echo " - Base URL: http://localhost:$PORT"
|
||||
echo " - Model Code: ${MODEL_CODE}"
|
||||
echo " - API Key: $API_KEY"
|
||||
echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN"
|
||||
echo ""
|
||||
echo "🧪 Test the server with:"
|
||||
echo " python examples/online_serving/openai_embedding_long_text_client.py"
|
||||
echo ""
|
||||
echo "📚 Enhanced features enabled:"
|
||||
echo " ✅ Intelligent native pooling type detection"
|
||||
echo " ✅ Unified MEAN aggregation for chunked processing"
|
||||
echo " ✅ Model-specific native pooling optimization"
|
||||
echo " ✅ Enhanced max embedding length (${MAX_EMBED_LEN} tokens)"
|
||||
echo " ✅ Complete semantic coverage for all pooling types"
|
||||
echo " ✅ OpenAI-compatible API"
|
||||
echo " ✅ GPU acceleration"
|
||||
echo ""
|
||||
echo "🔧 Advanced usage:"
|
||||
echo " - Set POOLING_TYPE=MEAN|CLS|LAST to override auto-detection"
|
||||
echo " - Set MAX_EMBED_LEN to adjust maximum input length"
|
||||
echo " - All pooling types use MEAN aggregation across chunks"
|
||||
@ -15,6 +15,14 @@ else
|
||||
MODEL=$2
|
||||
fi
|
||||
|
||||
# The prefillers and decoders in LMCache use the same hash seed for all chunk keys.
|
||||
# This seed must be aligned so that decoders can identify and retrieve KV cache
|
||||
# entries stored by prefillers.
|
||||
#
|
||||
# WARNING: Using a fixed hash seed is insecure and makes the application vulnerable to
|
||||
# denial-of-service attacks. In a production environment, this should be set to a
|
||||
# secure random value. This is set to a fixed value for demonstration purposes only.
|
||||
export PYTHONHASHSEED=${VLLM_PYTHON_HASH_SEED:-123}
|
||||
|
||||
if [[ $1 == "prefiller" ]]; then
|
||||
# Prefiller listens on port 8100
|
||||
|
||||
@ -34,13 +34,14 @@ theme:
|
||||
- content.action.edit
|
||||
- content.code.copy
|
||||
- content.tabs.link
|
||||
- navigation.instant
|
||||
- navigation.instant.progress
|
||||
- navigation.tracking
|
||||
- navigation.tabs
|
||||
- navigation.tabs.sticky
|
||||
- navigation.sections
|
||||
- navigation.prune
|
||||
- navigation.top
|
||||
- navigation.indexes
|
||||
- navigation.top
|
||||
- search.highlight
|
||||
- search.share
|
||||
- toc.follow
|
||||
|
||||
@ -29,3 +29,5 @@ setproctitle
|
||||
torch
|
||||
transformers
|
||||
zmq
|
||||
uvloop
|
||||
prometheus-client
|
||||
|
||||
@ -10,7 +10,7 @@ pytest-timeout
|
||||
# testing utils
|
||||
backoff # required for phi4mm test
|
||||
blobfile # required for kimi-vl test
|
||||
einops # required for MPT, qwen-vl and Mamba
|
||||
einops # required for MPT, qwen-vl
|
||||
httpx
|
||||
librosa # required for audio tests
|
||||
vector_quantize_pytorch # required for minicpmo_26 test
|
||||
@ -21,12 +21,11 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
|
||||
sentence-transformers # required for embedding tests
|
||||
soundfile # required for audio tests
|
||||
jiwer # required for audio tests
|
||||
timm # required for internvl test
|
||||
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
||||
torch==2.7.1
|
||||
torchaudio==2.7.1
|
||||
torchvision==0.22.1
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
mamba_ssm==2.2.5 # required for plamo2 test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
|
||||
num2words # required for smolvlm test
|
||||
@ -53,4 +52,4 @@ runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
fastsafetensors>=0.1.10
|
||||
pydantic>=2.10 # 2.9 leads to error on python 3.10
|
||||
terratorch==1.1rc2 # required for PrithviMAE test
|
||||
terratorch==1.1rc2 # required for PrithviMAE test
|
||||
|
||||
@ -178,7 +178,6 @@ einops==0.8.1
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# encodec
|
||||
# mamba-ssm
|
||||
# terratorch
|
||||
# torchgeo
|
||||
# vector-quantize-pytorch
|
||||
@ -417,8 +416,6 @@ lxml==5.3.0
|
||||
# sacrebleu
|
||||
mako==1.3.10
|
||||
# via alembic
|
||||
mamba-ssm==2.2.5
|
||||
# via -r requirements/test.in
|
||||
markdown==3.8.2
|
||||
# via mlflow
|
||||
markdown-it-py==3.0.0
|
||||
@ -475,8 +472,6 @@ networkx==3.2.1
|
||||
# via
|
||||
# scikit-image
|
||||
# torch
|
||||
ninja==1.11.1.3
|
||||
# via mamba-ssm
|
||||
nltk==3.9.1
|
||||
# via rouge-score
|
||||
num2words==0.5.14
|
||||
@ -629,7 +624,6 @@ packaging==24.2
|
||||
# lazy-loader
|
||||
# lightning
|
||||
# lightning-utilities
|
||||
# mamba-ssm
|
||||
# matplotlib
|
||||
# mlflow-skinny
|
||||
# peft
|
||||
@ -973,7 +967,6 @@ sentencepiece==0.2.0
|
||||
setuptools==77.0.3
|
||||
# via
|
||||
# lightning-utilities
|
||||
# mamba-ssm
|
||||
# pytablewriter
|
||||
# torch
|
||||
# triton
|
||||
@ -1058,7 +1051,7 @@ tiktoken==0.7.0
|
||||
# via
|
||||
# lm-eval
|
||||
# mistral-common
|
||||
timm==1.0.15
|
||||
timm==1.0.17
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# open-clip-torch
|
||||
@ -1085,7 +1078,6 @@ torch==2.7.1+cu128
|
||||
# lightly
|
||||
# lightning
|
||||
# lm-eval
|
||||
# mamba-ssm
|
||||
# mteb
|
||||
# open-clip-torch
|
||||
# peft
|
||||
@ -1152,16 +1144,13 @@ transformers==4.55.0
|
||||
# -r requirements/test.in
|
||||
# genai-perf
|
||||
# lm-eval
|
||||
# mamba-ssm
|
||||
# peft
|
||||
# sentence-transformers
|
||||
# transformers-stream-generator
|
||||
transformers-stream-generator==0.0.5
|
||||
# via -r requirements/test.in
|
||||
triton==3.3.1
|
||||
# via
|
||||
# mamba-ssm
|
||||
# torch
|
||||
# via torch
|
||||
tritonclient==2.51.0
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
|
||||
187
setup.py
187
setup.py
@ -7,6 +7,7 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@ -281,10 +282,81 @@ class cmake_build_ext(build_ext):
|
||||
self.copy_file(file, dst_file)
|
||||
|
||||
|
||||
class repackage_wheel(build_ext):
|
||||
class precompiled_build_ext(build_ext):
|
||||
"""Disables extension building when using precompiled binaries."""
|
||||
|
||||
def run(self) -> None:
|
||||
assert _is_cuda(
|
||||
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
|
||||
|
||||
def build_extensions(self) -> None:
|
||||
print("Skipping build_ext: using precompiled extensions.")
|
||||
return
|
||||
|
||||
|
||||
class precompiled_wheel_utils:
|
||||
"""Extracts libraries and other files from an existing wheel."""
|
||||
|
||||
def get_base_commit_in_main_branch(self) -> str:
|
||||
@staticmethod
|
||||
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
||||
temp_dir = None
|
||||
try:
|
||||
if not os.path.isfile(wheel_url_or_path):
|
||||
wheel_filename = wheel_url_or_path.split("/")[-1]
|
||||
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
|
||||
wheel_path = os.path.join(temp_dir, wheel_filename)
|
||||
print(f"Downloading wheel from {wheel_url_or_path} "
|
||||
f"to {wheel_path}")
|
||||
from urllib.request import urlretrieve
|
||||
urlretrieve(wheel_url_or_path, filename=wheel_path)
|
||||
else:
|
||||
wheel_path = wheel_url_or_path
|
||||
print(f"Using existing wheel at {wheel_path}")
|
||||
|
||||
package_data_patch = {}
|
||||
|
||||
with zipfile.ZipFile(wheel_path) as wheel:
|
||||
files_to_copy = [
|
||||
"vllm/_C.abi3.so",
|
||||
"vllm/_moe_C.abi3.so",
|
||||
"vllm/_flashmla_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
]
|
||||
|
||||
compiled_regex = re.compile(
|
||||
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
|
||||
file_members = list(
|
||||
filter(lambda x: x.filename in files_to_copy,
|
||||
wheel.filelist))
|
||||
file_members += list(
|
||||
filter(lambda x: compiled_regex.match(x.filename),
|
||||
wheel.filelist))
|
||||
|
||||
for file in file_members:
|
||||
print(f"[extract] {file.filename}")
|
||||
target_path = os.path.join(".", file.filename)
|
||||
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
||||
with wheel.open(file.filename) as src, open(
|
||||
target_path, "wb") as dst:
|
||||
shutil.copyfileobj(src, dst)
|
||||
|
||||
pkg = os.path.dirname(file.filename).replace("/", ".")
|
||||
package_data_patch.setdefault(pkg, []).append(
|
||||
os.path.basename(file.filename))
|
||||
|
||||
return package_data_patch
|
||||
finally:
|
||||
if temp_dir is not None:
|
||||
print(f"Removing temporary directory {temp_dir}")
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
@staticmethod
|
||||
def get_base_commit_in_main_branch() -> str:
|
||||
# Force to use the nightly wheel. This is mainly used for CI testing.
|
||||
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
|
||||
return "nightly"
|
||||
@ -297,6 +369,10 @@ class repackage_wheel(build_ext):
|
||||
]).decode("utf-8")
|
||||
upstream_main_commit = json.loads(resp_json)["sha"]
|
||||
|
||||
# In Docker build context, .git may be immutable or missing.
|
||||
if envs.VLLM_DOCKER_BUILD_CONTEXT:
|
||||
return upstream_main_commit
|
||||
|
||||
# Check if the upstream_main_commit exists in the local repo
|
||||
try:
|
||||
subprocess.check_output(
|
||||
@ -329,86 +405,6 @@ class repackage_wheel(build_ext):
|
||||
"wheel may not be compatible with your dev branch: %s", err)
|
||||
return "nightly"
|
||||
|
||||
def run(self) -> None:
|
||||
assert _is_cuda(
|
||||
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
|
||||
|
||||
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
|
||||
if wheel_location is None:
|
||||
base_commit = self.get_base_commit_in_main_branch()
|
||||
wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
# Fallback to nightly wheel if latest commit wheel is unavailable,
|
||||
# in this rare case, the nightly release CI hasn't finished on main.
|
||||
if not is_url_available(wheel_location):
|
||||
wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
|
||||
import zipfile
|
||||
|
||||
if os.path.isfile(wheel_location):
|
||||
wheel_path = wheel_location
|
||||
print(f"Using existing wheel={wheel_path}")
|
||||
else:
|
||||
# Download the wheel from a given URL, assume
|
||||
# the filename is the last part of the URL
|
||||
wheel_filename = wheel_location.split("/")[-1]
|
||||
|
||||
import tempfile
|
||||
|
||||
# create a temporary directory to store the wheel
|
||||
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
|
||||
wheel_path = os.path.join(temp_dir, wheel_filename)
|
||||
|
||||
print(f"Downloading wheel from {wheel_location} to {wheel_path}")
|
||||
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
try:
|
||||
urlretrieve(wheel_location, filename=wheel_path)
|
||||
except Exception as e:
|
||||
from setuptools.errors import SetupError
|
||||
|
||||
raise SetupError(
|
||||
f"Failed to get vLLM wheel from {wheel_location}") from e
|
||||
|
||||
with zipfile.ZipFile(wheel_path) as wheel:
|
||||
files_to_copy = [
|
||||
"vllm/_C.abi3.so",
|
||||
"vllm/_moe_C.abi3.so",
|
||||
"vllm/_flashmla_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
# "vllm/_version.py", # not available in nightly wheels yet
|
||||
]
|
||||
|
||||
file_members = list(
|
||||
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
|
||||
|
||||
# vllm_flash_attn python code:
|
||||
# Regex from
|
||||
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
|
||||
compiled_regex = re.compile(
|
||||
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
|
||||
file_members += list(
|
||||
filter(lambda x: compiled_regex.match(x.filename),
|
||||
wheel.filelist))
|
||||
|
||||
for file in file_members:
|
||||
print(f"Extracting and including {file.filename} "
|
||||
"from existing wheel")
|
||||
package_name = os.path.dirname(file.filename).replace("/", ".")
|
||||
file_name = os.path.basename(file.filename)
|
||||
|
||||
if package_name not in package_data:
|
||||
package_data[package_name] = []
|
||||
|
||||
wheel.extract(file)
|
||||
if file_name.endswith(".py"):
|
||||
# python files shouldn't be added to package_data
|
||||
continue
|
||||
|
||||
package_data[package_name].append(file_name)
|
||||
|
||||
|
||||
def _no_device() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "empty"
|
||||
@ -639,6 +635,29 @@ package_data = {
|
||||
]
|
||||
}
|
||||
|
||||
# If using precompiled, extract and patch package_data (in advance of setup)
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
|
||||
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
|
||||
if wheel_location is not None:
|
||||
wheel_url = wheel_location
|
||||
else:
|
||||
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
|
||||
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
from urllib.request import urlopen
|
||||
try:
|
||||
with urlopen(wheel_url) as resp:
|
||||
if resp.status != 200:
|
||||
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
except Exception as e:
|
||||
print(f"[warn] Falling back to nightly wheel: {e}")
|
||||
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
|
||||
|
||||
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
|
||||
wheel_url)
|
||||
for pkg, files in patch.items():
|
||||
package_data.setdefault(pkg, []).extend(files)
|
||||
|
||||
if _no_device():
|
||||
ext_modules = []
|
||||
|
||||
@ -647,7 +666,7 @@ if not ext_modules:
|
||||
else:
|
||||
cmdclass = {
|
||||
"build_ext":
|
||||
repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
|
||||
precompiled_build_ext if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
|
||||
}
|
||||
|
||||
setup(
|
||||
@ -665,7 +684,7 @@ setup(
|
||||
"mistral_common[audio]"], # Required for audio processing
|
||||
"video": [], # Kept for backwards compatibility
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
"flashinfer": ["flashinfer-python==0.2.10"],
|
||||
"flashinfer": ["flashinfer-python==0.2.11"],
|
||||
},
|
||||
cmdclass=cmdclass,
|
||||
package_data=package_data,
|
||||
|
||||
@ -1,409 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import uuid
|
||||
from asyncio import CancelledError
|
||||
from copy import copy
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import torch
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
|
||||
from vllm.outputs import RequestOutput as RealRequestOutput
|
||||
from vllm.sampling_params import RequestOutputKind
|
||||
|
||||
from ..utils import wait_for_gpu_memory_to_clear
|
||||
|
||||
|
||||
@dataclass
|
||||
class RequestOutput:
|
||||
request_id: int
|
||||
finished: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockModelConfig:
|
||||
use_async_output_proc = True
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
|
||||
|
||||
class MockEngine:
|
||||
|
||||
def __init__(self):
|
||||
self.step_calls = 0
|
||||
self.add_request_calls = 0
|
||||
self.abort_request_calls = 0
|
||||
self.request_id = None
|
||||
# Ugly, remove dependency when possible
|
||||
self.parallel_config = ParallelConfig()
|
||||
self.model_config = MockModelConfig()
|
||||
|
||||
async def step_async(self, virtual_engine):
|
||||
# PP size is 1, ignore virtual engine
|
||||
self.step_calls += 1
|
||||
return [RequestOutput(
|
||||
request_id=self.request_id)] if self.request_id else []
|
||||
|
||||
async def process_model_inputs_async(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
async def stop_remote_worker_execution_loop_async(self):
|
||||
pass
|
||||
|
||||
def generate(self, request_id):
|
||||
self.request_id = request_id
|
||||
|
||||
def stop_generating(self):
|
||||
self.request_id = None
|
||||
|
||||
def add_request(self, **kwargs):
|
||||
del kwargs # Unused
|
||||
self.add_request_calls += 1
|
||||
print(f'Request calls: {self.add_request_calls}')
|
||||
|
||||
async def add_request_async(self, **kwargs):
|
||||
self.add_request_calls += 1
|
||||
return
|
||||
|
||||
def abort_request(self, request_id):
|
||||
del request_id # Unused
|
||||
self.abort_request_calls += 1
|
||||
|
||||
def has_unfinished_requests(self):
|
||||
return self.request_id is not None
|
||||
|
||||
def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
|
||||
return self.request_id is not None
|
||||
|
||||
|
||||
class MockAsyncLLMEngine(AsyncLLMEngine):
|
||||
_engine_class = MockEngine
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_new_requests_event():
|
||||
params = SamplingParams()
|
||||
|
||||
engine = MockAsyncLLMEngine()
|
||||
engine.start_background_loop()
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.step_calls == 0
|
||||
|
||||
await engine.add_request("1", "", params)
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 1
|
||||
assert engine.engine.step_calls == 1
|
||||
|
||||
await engine.add_request("2", "", params)
|
||||
engine.engine.generate("2")
|
||||
await asyncio.sleep(0)
|
||||
await asyncio.sleep(0)
|
||||
await asyncio.sleep(0)
|
||||
assert engine.engine.add_request_calls == 2
|
||||
assert engine.engine.step_calls >= 2
|
||||
await asyncio.sleep(0.001)
|
||||
assert engine.engine.step_calls >= 3
|
||||
engine.engine.stop_generating()
|
||||
await asyncio.sleep(0.001)
|
||||
old_step_calls = engine.engine.step_calls
|
||||
await asyncio.sleep(0.001)
|
||||
assert engine.engine.step_calls == old_step_calls
|
||||
|
||||
await engine.add_request("3", "", params)
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
|
||||
engine = MockAsyncLLMEngine()
|
||||
assert engine.get_model_config() is not None
|
||||
assert engine.get_tokenizer() is not None
|
||||
assert engine.get_decoding_config() is not None
|
||||
|
||||
|
||||
def start_engine():
|
||||
wait_for_gpu_memory_to_clear(
|
||||
devices=list(range(torch.cuda.device_count())),
|
||||
threshold_bytes=2 * 2**30,
|
||||
timeout_s=60,
|
||||
)
|
||||
|
||||
num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
|
||||
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
|
||||
|
||||
return AsyncLLMEngine.from_engine_args(
|
||||
AsyncEngineArgs(model="facebook/opt-125m",
|
||||
enforce_eager=True,
|
||||
num_scheduler_steps=num_scheduler_steps))
|
||||
|
||||
|
||||
def uid() -> str:
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="module")
|
||||
async def async_engine():
|
||||
# We cannot use monkeypatch since this is a module
|
||||
# scoped fixture and monkeypatch is function scoped.
|
||||
previous_value = os.getenv("VLLM_USE_V1", None)
|
||||
os.environ["VLLM_USE_V1"] = "0"
|
||||
engine = await asyncio.get_event_loop().run_in_executor(executor=None,
|
||||
func=start_engine)
|
||||
try:
|
||||
yield engine
|
||||
finally:
|
||||
engine.shutdown_background_loop()
|
||||
del engine
|
||||
await asyncio.sleep(0.1)
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
if previous_value:
|
||||
os.environ["VLLM_USE_V1"] = previous_value
|
||||
else:
|
||||
del os.environ["VLLM_USE_V1"]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def should_do_global_cleanup_after_test(request) -> bool:
|
||||
# So we can share the async engine fixture between these tests
|
||||
return False
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_asyncio_run(async_engine, stop):
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
async def run(prompt: str):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
sampling_params,
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
return final_output, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("test0"),
|
||||
run("test0"),
|
||||
)
|
||||
assert len(results) == 2
|
||||
first, second = results
|
||||
|
||||
# remove nondeterministic fields for comparison
|
||||
first[0].metrics = None
|
||||
second[0].metrics = None
|
||||
first[0].request_id = None
|
||||
second[0].request_id = None
|
||||
|
||||
assert str(first) == str(second)
|
||||
|
||||
output_count = results[0][1]
|
||||
if num_scheduler_steps == 1:
|
||||
assert output_count == 32
|
||||
else:
|
||||
assert 1 < output_count < 32
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_output_kinds(async_engine, stop):
|
||||
"""Test that output_kind works as expected and that
|
||||
results are equivalent across different kinds."""
|
||||
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
max_tokens=32,
|
||||
min_tokens=32,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
async def run(prompt: str, kind: RequestOutputKind):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = kind
|
||||
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
output_count += 1
|
||||
final_output = output
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return (final_output.prompt_token_ids,
|
||||
final_output.outputs[0].token_ids,
|
||||
final_output.outputs[0].text, output_count)
|
||||
|
||||
async def run_deltas(prompt: str):
|
||||
params = copy(sampling_params)
|
||||
params.output_kind = RequestOutputKind.DELTA
|
||||
|
||||
prompt_tokens = None
|
||||
output_tokens: list[int] = []
|
||||
output_text = ""
|
||||
output_count = 0
|
||||
final_output = None
|
||||
async for output in async_engine.generate(prompt,
|
||||
params,
|
||||
request_id=uid()):
|
||||
token_ids = output.outputs[0].token_ids
|
||||
text = output.outputs[0].text
|
||||
final_output = output
|
||||
|
||||
# Ensure we get prompt ids iff we haven't yet received output tokens
|
||||
if output_tokens:
|
||||
assert 1 <= len(token_ids) <= num_scheduler_steps
|
||||
assert stop or text
|
||||
assert not output.prompt_token_ids
|
||||
else:
|
||||
assert output.prompt_token_ids
|
||||
prompt_tokens = output.prompt_token_ids
|
||||
|
||||
output_tokens.extend(token_ids)
|
||||
output_text += text
|
||||
|
||||
output_count += 1
|
||||
|
||||
assert final_output is not None
|
||||
assert final_output.finished
|
||||
|
||||
return prompt_tokens, output_tokens, output_text, output_count
|
||||
|
||||
results = await asyncio.gather(
|
||||
run("common input prompt", RequestOutputKind.CUMULATIVE),
|
||||
run("common input prompt", RequestOutputKind.FINAL_ONLY),
|
||||
run_deltas("common input prompt"))
|
||||
|
||||
# Make sure outputs are the same
|
||||
prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
|
||||
assert len(prompt_set) == 1
|
||||
|
||||
text_set = set(text for _, _, text, _ in results)
|
||||
assert len(text_set) == 1
|
||||
|
||||
tokens_set = set(tuple(ids) for _, ids, _, _ in results)
|
||||
assert len(tokens_set) == 1
|
||||
|
||||
cumulative, final, deltas = results
|
||||
|
||||
# output message counts
|
||||
assert cumulative[3] == deltas[3]
|
||||
|
||||
if num_scheduler_steps == 1:
|
||||
assert cumulative[3] == 32
|
||||
else:
|
||||
assert 1 < cumulative[3] < 32
|
||||
|
||||
assert final[3] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_cancellation(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
num_scheduler_steps = scheduler_config.num_scheduler_steps
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=13,
|
||||
max_tokens=13,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stop_at = 5 if num_scheduler_steps == 1 else 1
|
||||
|
||||
request_id = uid()
|
||||
|
||||
i = 0
|
||||
with pytest.raises(CancelledError):
|
||||
async for output in async_engine.generate("test2",
|
||||
sampling_params,
|
||||
request_id=request_id):
|
||||
assert not output.finished
|
||||
i += 1
|
||||
if i == stop_at:
|
||||
await async_engine.abort(request_id)
|
||||
|
||||
assert i == stop_at
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
@pytest.mark.parametrize("stop", [None, ["a stop string"]])
|
||||
async def test_delayed_generator(async_engine, stop):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
|
||||
if scheduler_config.num_scheduler_steps != 1:
|
||||
pytest.skip("no need to test this one with multistep")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=10,
|
||||
max_tokens=10,
|
||||
stop=stop,
|
||||
)
|
||||
|
||||
stream = async_engine.generate("test3", sampling_params, request_id=uid())
|
||||
i = 0
|
||||
final_output: Optional[RealRequestOutput] = None
|
||||
async for output in stream:
|
||||
final_output = output
|
||||
if i == 0:
|
||||
# wait for generation to complete before consuming
|
||||
# the remaining messages
|
||||
await asyncio.sleep(1)
|
||||
if i < 9:
|
||||
assert not output.finished
|
||||
i += 1
|
||||
|
||||
assert i == 10
|
||||
assert final_output is not None
|
||||
assert len(final_output.outputs[0].token_ids) == 10
|
||||
assert final_output.finished
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_invalid_argument(async_engine):
|
||||
scheduler_config = await async_engine.get_scheduler_config()
|
||||
|
||||
if scheduler_config.num_scheduler_steps != 1:
|
||||
pytest.skip("no need to test this one with multistep")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
min_tokens=10,
|
||||
max_tokens=10,
|
||||
)
|
||||
|
||||
# Targeting specific DP rank only supported in v1 multi-instance DP
|
||||
with pytest.raises(ValueError):
|
||||
async for _ in async_engine.generate("test",
|
||||
sampling_params,
|
||||
request_id=uid(),
|
||||
data_parallel_rank=0):
|
||||
pass
|
||||
@ -2,4 +2,3 @@ port: 12312
|
||||
served_model_name: mymodel
|
||||
tensor_parallel_size: 2
|
||||
trust_remote_code: true
|
||||
multi_step_stream_outputs: false
|
||||
|
||||
@ -4,4 +4,3 @@ port: 12312
|
||||
served_model_name: mymodel
|
||||
tensor_parallel_size: 2
|
||||
trust_remote_code: true
|
||||
multi_step_stream_outputs: false
|
||||
|
||||
@ -644,11 +644,9 @@ def test_chunked_prefill_preempt():
|
||||
assert out.num_batched_tokens == max_num_batched_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_scheduler_steps", [1, 5])
|
||||
def test_chunked_prefill_spec_prefill(num_scheduler_steps):
|
||||
def test_chunked_prefill_spec_prefill():
|
||||
"""Verify that the num_lookahead_slots is set appropriately for an all"""
|
||||
"""prefill batch depending on whether multi-step scheduling is enabled"""
|
||||
"""or not"""
|
||||
"""prefill batch."""
|
||||
block_size = 4
|
||||
max_seqs = 30
|
||||
max_model_len = 200
|
||||
@ -661,7 +659,6 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps):
|
||||
max_model_len,
|
||||
enable_chunked_prefill=True,
|
||||
num_lookahead_slots=num_lookahead_slots,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
)
|
||||
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
|
||||
cache_config.num_cpu_blocks = 16
|
||||
@ -679,8 +676,7 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps):
|
||||
assert out.num_prefill_groups == 1
|
||||
assert out.num_batched_tokens == max_num_batched_tokens
|
||||
print(out.num_lookahead_slots)
|
||||
assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else
|
||||
num_lookahead_slots)
|
||||
assert out.num_lookahead_slots == 0
|
||||
|
||||
|
||||
def test_chunked_prefill_max_seqs():
|
||||
|
||||
@ -6,7 +6,6 @@ import pytest
|
||||
from tests.conftest import VllmRunner
|
||||
from tests.core.utils import create_dummy_prompt
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SequenceGroup
|
||||
|
||||
MODEL = "JackFram/llama-160m"
|
||||
@ -17,32 +16,19 @@ def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
|
||||
scheduler.add_seq_group(seq_group)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
|
||||
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
|
||||
@pytest.mark.parametrize("enforce_eager", [False, True])
|
||||
def test_num_computed_tokens_update(num_scheduler_steps: int,
|
||||
enable_chunked_prefill: bool,
|
||||
def test_num_computed_tokens_update(enable_chunked_prefill: bool,
|
||||
enforce_eager: bool):
|
||||
|
||||
is_multi_step = num_scheduler_steps > 1
|
||||
is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill
|
||||
|
||||
if is_multi_step_chunked_prefill and current_platform.is_rocm():
|
||||
pytest.skip("Multi-step with Chunked-Prefill does not support "
|
||||
"rocm_flash_attn backend")
|
||||
|
||||
# Make a vllm engine
|
||||
runner = VllmRunner(model_name=MODEL,
|
||||
gpu_memory_utilization=0.7,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
enforce_eager=enforce_eager)
|
||||
engine: LLMEngine = runner.llm.llm_engine
|
||||
|
||||
# In multi-step + chunked-prefill there is no separate single prompt step.
|
||||
# What is scheduled will run for num_scheduler_steps always.
|
||||
num_prompt_steps = num_scheduler_steps \
|
||||
if is_multi_step_chunked_prefill else 1
|
||||
num_prompt_steps = 1
|
||||
|
||||
num_output_tokens_list = [4, 8, 12, 15, 16, 17]
|
||||
|
||||
@ -73,10 +59,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
|
||||
# Test correctness of num_computed_tokens after the decode steps
|
||||
assert seq.data.get_num_computed_tokens(
|
||||
) == prompt_num_computed_tokens + decode_step_counter
|
||||
for _ in range(num_scheduler_steps):
|
||||
# decode step
|
||||
engine.step()
|
||||
decode_step_counter += 1
|
||||
engine.step()
|
||||
decode_step_counter += 1
|
||||
|
||||
# Test correctness of num_computed_tokens after the sequence finish.
|
||||
assert seq.data.get_num_computed_tokens(
|
||||
|
||||
@ -1,274 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from transformers import PreTrainedTokenizer
|
||||
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
|
||||
SequenceOutput, SequenceStatus)
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.utils import Counter
|
||||
|
||||
from ..core.utils import create_seq_group
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [1, 12])
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
|
||||
"""Verify multi-step decoding appends token ids correctly.
|
||||
|
||||
We append token ids and verify all the token ids were appended correctly.
|
||||
Note that ignore_eos=True.
|
||||
"""
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=1024,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(max_tokens=seq_output_len +
|
||||
num_new_tokens,
|
||||
ignore_eos=True),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_prompt_len", [1024])
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
|
||||
@pytest.mark.parametrize("max_tokens", [128 + 3])
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
|
||||
seq_output_len: int, max_tokens: int):
|
||||
"""Verify tokens after max_tokens are dropped and not appended to the
|
||||
sequence.
|
||||
"""
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=seq_prompt_len,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(max_tokens=max_tokens, ),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
|
||||
# Expect the processed sequence to not go over max tokens in len.
|
||||
assert seq.get_len() == seq_prompt_len + max_tokens
|
||||
|
||||
# Expect the correct tokens were appended.
|
||||
expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
|
||||
assert seq.get_token_ids(
|
||||
)[-len(expected_appended_tokens):] == expected_appended_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_prompt_len", [1024])
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [12])
|
||||
@pytest.mark.parametrize("seed", list(range(6)))
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
|
||||
seq_output_len: int, seed: int):
|
||||
"""Verify the eos token id is included in the sequence, but subsequent
|
||||
tokens are dropped (not appended to sequence).
|
||||
"""
|
||||
random.seed(seed)
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
eos_token_id = 100
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=seq_prompt_len,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(
|
||||
# Ensure enough space.
|
||||
max_tokens=seq_output_len + num_new_tokens, ),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
assert eos_token_id not in new_token_ids
|
||||
eos_index = random.randint(0, len(new_token_ids) - 1)
|
||||
new_token_ids[eos_index] = eos_token_id
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
|
||||
# Expect the processed sequence to not go beyond provided eos.
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)
|
||||
|
||||
# Expect the correct tokens were appended.
|
||||
expected_appended_tokens = new_token_ids[:eos_index + 1]
|
||||
assert seq.get_token_ids(
|
||||
)[-len(expected_appended_tokens):] == expected_appended_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_prompt_len", [1024])
|
||||
@pytest.mark.parametrize("seq_output_len", [128])
|
||||
@pytest.mark.parametrize("num_new_tokens", [12])
|
||||
@pytest.mark.parametrize("seed", list(range(6)))
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
|
||||
seq_output_len: int, seed: int):
|
||||
"""When sampling parameters dictate that we should ignore the eos token id,
|
||||
ensure all token ids are appended even if the eos token id is emitted.
|
||||
"""
|
||||
random.seed(seed)
|
||||
detokenizer = MagicMock(spec=Detokenizer)
|
||||
scheduler = MagicMock(spec=Scheduler)
|
||||
stop_checker = MagicMock(spec=StopChecker)
|
||||
seq_counter = Counter()
|
||||
|
||||
eos_token_id = 100
|
||||
|
||||
output_processor = MultiStepOutputProcessor(
|
||||
detokenizer=detokenizer,
|
||||
scheduler=[scheduler],
|
||||
seq_counter=seq_counter,
|
||||
get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
|
||||
stop_checker=stop_checker,
|
||||
)
|
||||
|
||||
seq_group = create_seq_group(
|
||||
seq_prompt_len=seq_prompt_len,
|
||||
seq_output_lens=[seq_output_len],
|
||||
sampling_params=SamplingParams(
|
||||
# Ensure enough space.
|
||||
max_tokens=seq_output_len + num_new_tokens,
|
||||
ignore_eos=True,
|
||||
),
|
||||
)
|
||||
|
||||
seq = seq_group.get_seqs()[0]
|
||||
seq.status = SequenceStatus.RUNNING
|
||||
|
||||
new_token_ids = list(range(num_new_tokens))
|
||||
assert eos_token_id not in new_token_ids
|
||||
eos_index = random.randint(0, len(new_token_ids) - 1)
|
||||
new_token_ids[eos_index] = eos_token_id
|
||||
|
||||
outputs = [
|
||||
CompletionSequenceGroupOutput(
|
||||
samples=[
|
||||
SequenceOutput(
|
||||
parent_seq_id=seq.seq_id,
|
||||
output_token=output_token,
|
||||
logprobs={output_token: Logprob(0.0)},
|
||||
)
|
||||
],
|
||||
prompt_logprobs=None,
|
||||
) for output_token in new_token_ids
|
||||
]
|
||||
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len
|
||||
output_processor.process_outputs(seq_group, outputs)
|
||||
|
||||
# Expect the processed sequence to go beyond eos.
|
||||
assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens
|
||||
|
||||
# Expect the correct tokens were appended.
|
||||
expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens -
|
||||
seq_output_len]
|
||||
assert seq.get_token_ids(
|
||||
)[-len(expected_appended_tokens):] == expected_appended_tokens
|
||||
|
||||
|
||||
def mock_tokenizer(eos_token_id=1000):
|
||||
tokenizer = MagicMock(spec=PreTrainedTokenizer)
|
||||
tokenizer.eos_token_id = eos_token_id
|
||||
return tokenizer
|
||||
@ -96,9 +96,6 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
|
||||
more_args = None
|
||||
if current_platform.is_tpu():
|
||||
# Limit compilation time for TPU V1
|
||||
|
||||
# xet doesn't work well for Qwen/Qwen3-1.7B
|
||||
m.setenv("HF_HUB_DISABLE_XET", "1")
|
||||
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
|
||||
|
||||
# Add TP test (if provided)
|
||||
|
||||
@ -65,3 +65,9 @@ def test_pooling_params(llm: LLM):
|
||||
assert torch.allclose(
|
||||
softmax(wo_activation), w_activation, atol=1e-2
|
||||
), "w_activation should be close to activation(wo_activation)."
|
||||
|
||||
|
||||
def test_encode_api(llm: LLM):
|
||||
err_msg = "pooling_task must be one of.+"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.encode(prompts, use_tqdm=False)
|
||||
|
||||
@ -26,15 +26,12 @@ DEFAULT_ARGS = ["--max-model-len", "4096"]
|
||||
MORE_ARGS_LIST = [
|
||||
[], # Default
|
||||
["--enable-chunked-prefill"], # Chunked
|
||||
["--num-scheduler-steps", "8"], # MS
|
||||
["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream
|
||||
]
|
||||
MAX_WAIT_SECONDS = None
|
||||
|
||||
if current_platform.is_tpu():
|
||||
MORE_ARGS_LIST = [
|
||||
[], # Default
|
||||
# ["--num-scheduler-steps", "8"], # Multi-step << currently fails
|
||||
]
|
||||
MAX_WAIT_SECONDS = 600
|
||||
|
||||
|
||||
@ -23,6 +23,8 @@ MAXIMUM_AUDIOS = 2
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--dtype",
|
||||
"float32",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
|
||||
@ -211,3 +211,18 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
|
||||
assert torch.allclose(
|
||||
F.softmax(wo_activation, dim=-1), w_activation, atol=1e-2
|
||||
), "w_activation should be close to activation(wo_activation)."
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_pooling(server: RemoteOpenAIServer, model_name: str):
|
||||
# pooling api uses ALL pooling, which does not support chunked prefill.
|
||||
response = requests.post(
|
||||
server.url_for("pooling"),
|
||||
json={
|
||||
"model": model_name,
|
||||
"input": "test",
|
||||
"encoding_format": "float"
|
||||
},
|
||||
)
|
||||
assert response.json()["error"]["type"] == "BadRequestError"
|
||||
|
||||
441
tests/entrypoints/openai/test_embedding_long_text.py
Normal file
441
tests/entrypoints/openai/test_embedding_long_text.py
Normal file
@ -0,0 +1,441 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Test cases for long text embedding with automatic chunking mechanism.
|
||||
|
||||
This test suite validates vLLM's automatic chunking functionality for handling
|
||||
text inputs that exceed the model's maximum token length, specifically targeting
|
||||
the intfloat/multilingual-e5-small model (max token length: 512).
|
||||
"""
|
||||
|
||||
import random
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
|
||||
def _generate_random_text(word_count: int) -> str:
|
||||
"""Generate random text with approximately the specified word count."""
|
||||
# Common English words with focus on verbs and nouns for realistic text
|
||||
common_words = [
|
||||
# Essential articles and pronouns (minimal)
|
||||
"the",
|
||||
"and",
|
||||
"you",
|
||||
"they",
|
||||
"this",
|
||||
"that",
|
||||
"these",
|
||||
"those",
|
||||
|
||||
# Action verbs
|
||||
"create",
|
||||
"build",
|
||||
"develop",
|
||||
"design",
|
||||
"implement",
|
||||
"execute",
|
||||
"analyze",
|
||||
"process",
|
||||
"generate",
|
||||
"calculate",
|
||||
"evaluate",
|
||||
"optimize",
|
||||
"transform",
|
||||
"integrate",
|
||||
"configure",
|
||||
"deploy",
|
||||
"monitor",
|
||||
"manage",
|
||||
"discover",
|
||||
"explore",
|
||||
"investigate",
|
||||
"research",
|
||||
"study",
|
||||
"examine",
|
||||
"improve",
|
||||
"enhance",
|
||||
"upgrade",
|
||||
"modify",
|
||||
"update",
|
||||
"maintain",
|
||||
"solve",
|
||||
"resolve",
|
||||
"handle",
|
||||
"address",
|
||||
"tackle",
|
||||
"overcome",
|
||||
"communicate",
|
||||
"collaborate",
|
||||
"coordinate",
|
||||
"organize",
|
||||
"plan",
|
||||
"achieve",
|
||||
"accomplish",
|
||||
"complete",
|
||||
"finish",
|
||||
"deliver",
|
||||
"provide",
|
||||
|
||||
# Technology and science nouns
|
||||
"system",
|
||||
"application",
|
||||
"software",
|
||||
"hardware",
|
||||
"network",
|
||||
"database",
|
||||
"algorithm",
|
||||
"model",
|
||||
"framework",
|
||||
"platform",
|
||||
"interface",
|
||||
"protocol",
|
||||
"architecture",
|
||||
"infrastructure",
|
||||
"component",
|
||||
"module",
|
||||
"service",
|
||||
"technology",
|
||||
"innovation",
|
||||
"solution",
|
||||
"methodology",
|
||||
"approach",
|
||||
"artificial",
|
||||
"intelligence",
|
||||
"machine",
|
||||
"learning",
|
||||
"neural",
|
||||
"network",
|
||||
"computer",
|
||||
"processor",
|
||||
"memory",
|
||||
"storage",
|
||||
"computation",
|
||||
"data",
|
||||
"information",
|
||||
"knowledge",
|
||||
"insight",
|
||||
"pattern",
|
||||
"trend",
|
||||
"analysis",
|
||||
"research",
|
||||
"development",
|
||||
"engineering",
|
||||
"science",
|
||||
"mathematics",
|
||||
"statistics",
|
||||
"probability",
|
||||
"optimization",
|
||||
"performance",
|
||||
"efficiency",
|
||||
|
||||
# General nouns
|
||||
"project",
|
||||
"team",
|
||||
"organization",
|
||||
"company",
|
||||
"business",
|
||||
"industry",
|
||||
"market",
|
||||
"customer",
|
||||
"user",
|
||||
"client",
|
||||
"product",
|
||||
"feature",
|
||||
"function",
|
||||
"requirement",
|
||||
"specification",
|
||||
"documentation",
|
||||
"report",
|
||||
"result",
|
||||
"outcome",
|
||||
"impact",
|
||||
"benefit",
|
||||
"advantage",
|
||||
"challenge",
|
||||
"problem",
|
||||
"opportunity",
|
||||
"strategy",
|
||||
"goal",
|
||||
"objective",
|
||||
"target",
|
||||
"milestone",
|
||||
"process",
|
||||
"procedure",
|
||||
"workflow",
|
||||
"pipeline",
|
||||
"operation",
|
||||
"task",
|
||||
"activity",
|
||||
"event",
|
||||
"session",
|
||||
"meeting",
|
||||
"discussion",
|
||||
"decision"
|
||||
]
|
||||
|
||||
words = []
|
||||
for _ in range(word_count):
|
||||
words.append(random.choice(common_words))
|
||||
|
||||
# Add some punctuation for more realistic text
|
||||
text = " ".join(words)
|
||||
# Add periods every 10-20 words
|
||||
words_list = text.split()
|
||||
result = []
|
||||
for i, word in enumerate(words_list):
|
||||
result.append(word)
|
||||
if ((i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1):
|
||||
result[-1] += "."
|
||||
|
||||
return " ".join(result)
|
||||
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
# Test text: Generate text with approximately 1500 words to exceed 1024 tokens
|
||||
LONG_TEXT_1500_WORDS = _generate_random_text(1500)
|
||||
|
||||
# Test text: Generate text with approximately 2500 words to exceed 2048 tokens
|
||||
LONG_TEXT_2500_WORDS = _generate_random_text(2500)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server_with_chunked_processing():
|
||||
"""Start server with automatic chunking processing enabled."""
|
||||
args = [
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--dtype",
|
||||
DTYPE,
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"512", # Set smaller max_model_len to trigger chunking mechanism
|
||||
'--override-pooler-config',
|
||||
('{"pooling_type": "MEAN", "normalize": true, '
|
||||
'"enable_chunked_processing": true, "max_embed_len": 10000}'),
|
||||
"--gpu-memory-utilization",
|
||||
"0.8",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client_with_chunked_processing(server_with_chunked_processing):
|
||||
"""Create async client with chunking processing support."""
|
||||
async with server_with_chunked_processing.get_async_client(
|
||||
) as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_long_text_embedding_1500_chars(
|
||||
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
|
||||
"""Test embedding processing for ~1500 character long text
|
||||
(~1028 tokens, exceeding 512 token limit)."""
|
||||
|
||||
# Verify text length
|
||||
# Verify text has sufficient word count (approximately 1500 words)
|
||||
word_count = len(LONG_TEXT_1500_WORDS.split())
|
||||
assert word_count >= 1400, (
|
||||
f"Test text word count insufficient: {word_count} words")
|
||||
|
||||
# Send embedding request
|
||||
embedding_response = await client_with_chunked_processing.embeddings.create(
|
||||
model=model_name,
|
||||
input=[LONG_TEXT_1500_WORDS],
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
# Verify response structure
|
||||
embeddings = EmbeddingResponse.model_validate(
|
||||
embedding_response.model_dump(mode="json"))
|
||||
|
||||
assert embeddings.id is not None
|
||||
assert len(embeddings.data) == 1
|
||||
assert len(embeddings.data[0].embedding
|
||||
) == 384 # multilingual-e5-small embedding dimension
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
# Due to chunked processing, token count should
|
||||
# reflect actual processed tokens
|
||||
# With ~1500 words, we expect roughly
|
||||
# 1024+ tokens (exceeding 512 token limit)
|
||||
# Should exceed single chunk limit of 512
|
||||
assert embeddings.usage.prompt_tokens > 800
|
||||
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
|
||||
|
||||
# Verify embedding vector validity
|
||||
embedding_vector = embeddings.data[0].embedding
|
||||
assert all(
|
||||
isinstance(x, float)
|
||||
for x in embedding_vector), "Embedding vector should contain floats"
|
||||
assert not all(
|
||||
x == 0
|
||||
for x in embedding_vector), "Embedding vector should not be all zeros"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_long_text_embedding_2500_chars(
|
||||
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
|
||||
"""Test embedding processing for ~2500 character long text
|
||||
(~2048 tokens, requiring multiple chunks)."""
|
||||
|
||||
# Verify text length
|
||||
# Verify text has sufficient word count (approximately 2500 words)
|
||||
word_count = len(LONG_TEXT_2500_WORDS.split())
|
||||
assert word_count >= 2300, (
|
||||
f"Test text word count insufficient: {word_count} words")
|
||||
|
||||
# Send embedding request
|
||||
embedding_response = await client_with_chunked_processing.embeddings.create(
|
||||
model=model_name,
|
||||
input=[LONG_TEXT_2500_WORDS],
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
# Verify response structure
|
||||
embeddings = EmbeddingResponse.model_validate(
|
||||
embedding_response.model_dump(mode="json"))
|
||||
|
||||
assert embeddings.id is not None
|
||||
assert len(embeddings.data) == 1
|
||||
assert len(embeddings.data[0].embedding
|
||||
) == 384 # multilingual-e5-small embedding dimension
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
# Due to chunked processing, token count should
|
||||
# reflect actual processed tokens
|
||||
# With ~2500 words, we expect
|
||||
# roughly 2048+ tokens (requiring multiple chunks)
|
||||
# Should require multiple chunks for processing
|
||||
assert embeddings.usage.prompt_tokens > 1500
|
||||
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
|
||||
|
||||
# Verify embedding vector validity
|
||||
embedding_vector = embeddings.data[0].embedding
|
||||
assert all(
|
||||
isinstance(x, float)
|
||||
for x in embedding_vector), "Embedding vector should contain floats"
|
||||
assert not all(
|
||||
x == 0
|
||||
for x in embedding_vector), "Embedding vector should not be all zeros"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_batch_long_text_embedding(
|
||||
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
|
||||
"""Test batch long text embedding processing."""
|
||||
|
||||
input_texts = [
|
||||
LONG_TEXT_1500_WORDS,
|
||||
LONG_TEXT_2500_WORDS,
|
||||
"This is a short text test.", # Short text for comparison
|
||||
]
|
||||
|
||||
# Send batch embedding request
|
||||
embedding_response = await client_with_chunked_processing.embeddings.create(
|
||||
model=model_name,
|
||||
input=input_texts,
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
# Verify response structure
|
||||
embeddings = EmbeddingResponse.model_validate(
|
||||
embedding_response.model_dump(mode="json"))
|
||||
|
||||
assert embeddings.id is not None
|
||||
assert len(embeddings.data) == 3 # Three input texts
|
||||
|
||||
# Verify each embedding dimension
|
||||
for i, embedding_data in enumerate(embeddings.data):
|
||||
assert len(embedding_data.embedding) == 384
|
||||
assert embedding_data.index == i
|
||||
|
||||
# Verify embedding vector validity
|
||||
embedding_vector = embedding_data.embedding
|
||||
assert all(isinstance(x, float) for x in embedding_vector)
|
||||
assert not all(x == 0 for x in embedding_vector)
|
||||
|
||||
# Verify token usage
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
# Total token count should be very substantial
|
||||
assert embeddings.usage.prompt_tokens > 1000
|
||||
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_chunked_vs_normal_consistency(
|
||||
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
|
||||
"""Test consistency between chunked and
|
||||
normal processing (using short text)."""
|
||||
|
||||
# Use a short text within the 512 token limit
|
||||
short_text = ("Artificial intelligence technology is changing our world, "
|
||||
"bringing unprecedented opportunities and challenges.")
|
||||
|
||||
# Send embedding request
|
||||
embedding_response = await client_with_chunked_processing.embeddings.create(
|
||||
model=model_name,
|
||||
input=[short_text],
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
# Verify response structure
|
||||
embeddings = EmbeddingResponse.model_validate(
|
||||
embedding_response.model_dump(mode="json"))
|
||||
|
||||
assert embeddings.id is not None
|
||||
assert len(embeddings.data) == 1
|
||||
assert len(embeddings.data[0].embedding) == 384
|
||||
assert embeddings.usage.completion_tokens == 0
|
||||
# Short text should not require chunked processing
|
||||
assert embeddings.usage.prompt_tokens < 512
|
||||
assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
|
||||
|
||||
# 验证embedding向量的有效性
|
||||
embedding_vector = embeddings.data[0].embedding
|
||||
assert all(isinstance(x, float) for x in embedding_vector)
|
||||
assert not all(x == 0 for x in embedding_vector)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_chunked_processing_response_format(
|
||||
client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
|
||||
"""Test response format and structure during chunked processing."""
|
||||
|
||||
# Test with long text to trigger chunking
|
||||
embedding_response = await client_with_chunked_processing.embeddings.create(
|
||||
model=model_name,
|
||||
input=[LONG_TEXT_1500_WORDS],
|
||||
encoding_format="float",
|
||||
)
|
||||
|
||||
# Verify response structure
|
||||
embeddings = EmbeddingResponse.model_validate(
|
||||
embedding_response.model_dump(mode="json"))
|
||||
|
||||
assert embeddings.id is not None
|
||||
assert len(embeddings.data) == 1
|
||||
assert embeddings.data[0].object == "embedding"
|
||||
assert embeddings.data[0].index == 0
|
||||
|
||||
# Verify embedding vector properties
|
||||
embedding_vector = embeddings.data[0].embedding
|
||||
import math
|
||||
vector_norm = math.sqrt(sum(x * x for x in embedding_vector))
|
||||
# Check that the vector is normalized
|
||||
# (default behavior for most embedding models)
|
||||
assert 0.8 < vector_norm < 1.2, (
|
||||
f"Vector norm should be reasonable, actual: {vector_norm}")
|
||||
@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
|
||||
op = context.operation
|
||||
assert op is not None
|
||||
|
||||
def no_file_type(case: schemathesis.models.Case):
|
||||
def no_invalid_types(case: schemathesis.models.Case):
|
||||
"""
|
||||
This filter skips test cases for the `POST /tokenize` endpoint where the
|
||||
HTTP request body uses `"type": "file"` in any message's content.
|
||||
We expect these cases to fail because that type isn't implemented here
|
||||
https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
|
||||
This filter skips test cases with invalid data that schemathesis
|
||||
incorrectly generates due to permissive schema configurations.
|
||||
|
||||
1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
|
||||
message content, which isn't implemented.
|
||||
|
||||
2. Skips tool_calls with `"type": "custom"` which schemathesis
|
||||
incorrectly generates instead of the valid `"type": "function"`.
|
||||
|
||||
Example test cases that are skipped:
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
http://localhost:8000/tokenize
|
||||
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
http://localhost:8000/tokenize
|
||||
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
|
||||
http://localhost:8000/v1/chat/completions
|
||||
""" # noqa: E501
|
||||
if (op.method.lower() == "post" and op.path == "/tokenize"
|
||||
and hasattr(case, "body") and isinstance(case.body, dict)
|
||||
if (hasattr(case, "body") and isinstance(case.body, dict)
|
||||
and "messages" in case.body
|
||||
and isinstance(case.body["messages"], list)
|
||||
and len(case.body["messages"]) > 0):
|
||||
|
||||
for message in case.body["messages"]:
|
||||
if not isinstance(message, dict):
|
||||
continue
|
||||
content = message.get("content", [])
|
||||
if not isinstance(content, list) or len(content) == 0:
|
||||
continue
|
||||
if any(item.get("type") == "file" for item in content):
|
||||
return False
|
||||
|
||||
# Check for invalid file type in tokenize endpoint
|
||||
if op.method.lower() == "post" and op.path == "/tokenize":
|
||||
content = message.get("content", [])
|
||||
if (isinstance(content, list) and len(content) > 0 and any(
|
||||
item.get("type") == "file" for item in content)):
|
||||
return False
|
||||
|
||||
# Check for invalid tool_calls with non-function types
|
||||
tool_calls = message.get("tool_calls", [])
|
||||
if isinstance(tool_calls, list):
|
||||
for tool_call in tool_calls:
|
||||
if isinstance(tool_call, dict):
|
||||
if tool_call.get("type") != "function":
|
||||
return False
|
||||
if "custom" in tool_call:
|
||||
return False
|
||||
return True
|
||||
|
||||
return strategy.filter(no_file_type)
|
||||
return strategy.filter(no_invalid_types)
|
||||
|
||||
|
||||
@schema.parametrize()
|
||||
|
||||
@ -126,7 +126,9 @@ def test_invocations(server: RemoteOpenAIServer):
|
||||
invocation_output["results"]):
|
||||
assert rerank_result.keys() == invocations_result.keys()
|
||||
assert rerank_result["relevance_score"] == pytest.approx(
|
||||
invocations_result["relevance_score"], rel=0.01)
|
||||
invocations_result["relevance_score"], rel=0.05)
|
||||
# TODO: reset this tolerance to 0.01 once we find
|
||||
# an alternative to flash_attn with bfloat16
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
624
tests/entrypoints/openai/test_response_api_with_harmony.py
Normal file
624
tests/entrypoints/openai/test_response_api_with_harmony.py
Normal file
@ -0,0 +1,624 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
from openai import BadRequestError, NotFoundError, OpenAI
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.")
|
||||
|
||||
MODEL_NAME = "openai/gpt-oss-20b"
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = ["--enforce-eager", "--tool-server", "demo"]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_basic(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
)
|
||||
assert response is not None
|
||||
print("response: ", response)
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_basic_with_instructions(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
instructions="Respond in Korean.",
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is the capital of South Korea?",
|
||||
reasoning={"effort": "low"},
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_chat(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Respond in Korean."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello!"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Hello! How can I help you today?"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is 13 * 24? Explain your answer."
|
||||
},
|
||||
],
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_chat_with_input_type(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{
|
||||
"type": "input_text",
|
||||
"text": "What is 13*24?"
|
||||
}],
|
||||
},
|
||||
],
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_structured_output(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "Extract the event information."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content":
|
||||
"Alice and Bob are going to a science fair on Friday.",
|
||||
},
|
||||
],
|
||||
text={
|
||||
"format": {
|
||||
"type": "json_schema",
|
||||
"name": "calendar_event",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"date": {
|
||||
"type": "string"
|
||||
},
|
||||
"participants": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
},
|
||||
"required": ["name", "date", "participants"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"description": "A calendar event.",
|
||||
"strict": True,
|
||||
}
|
||||
},
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_structured_output_with_parse(client: OpenAI, model_name: str):
|
||||
from pydantic import BaseModel
|
||||
|
||||
class CalendarEvent(BaseModel):
|
||||
name: str
|
||||
date: str
|
||||
participants: list[str]
|
||||
|
||||
response = await client.responses.parse(
|
||||
model=model_name,
|
||||
input="Alice and Bob are going to a science fair on Friday",
|
||||
instructions="Extract the event information",
|
||||
text_format=CalendarEvent,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_store(client: OpenAI, model_name: str):
|
||||
for store in [True, False]:
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
store=store,
|
||||
)
|
||||
assert response is not None
|
||||
|
||||
try:
|
||||
_retrieved_response = await client.responses.retrieve(response.id)
|
||||
is_not_found = False
|
||||
except NotFoundError:
|
||||
is_not_found = True
|
||||
|
||||
assert is_not_found == (not store)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_background(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
background=True,
|
||||
)
|
||||
assert response is not None
|
||||
|
||||
retries = 0
|
||||
max_retries = 30
|
||||
while retries < max_retries:
|
||||
response = await client.responses.retrieve(response.id)
|
||||
if response.status == "completed":
|
||||
break
|
||||
time.sleep(1)
|
||||
retries += 1
|
||||
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_background_cancel(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="Write a long story about a cat.",
|
||||
background=True,
|
||||
)
|
||||
assert response is not None
|
||||
time.sleep(1)
|
||||
|
||||
cancelled_response = await client.responses.cancel(response.id)
|
||||
assert cancelled_response is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_stateful_multi_turn(client: OpenAI, model_name: str):
|
||||
response1 = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
)
|
||||
assert response1 is not None
|
||||
assert response1.status == "completed"
|
||||
|
||||
response2 = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What if I increase both numbers by 1?",
|
||||
previous_response_id=response1.id,
|
||||
)
|
||||
assert response2 is not None
|
||||
assert response2.status == "completed"
|
||||
|
||||
response3 = await client.responses.create(
|
||||
model=model_name,
|
||||
input="Divide the result by 2.",
|
||||
previous_response_id=response2.id,
|
||||
)
|
||||
assert response3 is not None
|
||||
assert response3.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_streaming(client: OpenAI, model_name: str):
|
||||
prompts = [
|
||||
"tell me a story about a cat in 20 words",
|
||||
"What is 13 * 24? Use python to calculate the result.",
|
||||
"When did Jensen found NVIDIA? Search it and answer the year only.",
|
||||
]
|
||||
|
||||
for prompt in prompts:
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=prompt,
|
||||
reasoning={"effort": "low"},
|
||||
tools=[
|
||||
{
|
||||
"type": "web_search_preview"
|
||||
},
|
||||
{
|
||||
"type": "code_interpreter",
|
||||
"container": {
|
||||
"type": "auto"
|
||||
}
|
||||
},
|
||||
],
|
||||
stream=True,
|
||||
)
|
||||
|
||||
events = []
|
||||
current_event_mode = None
|
||||
async for event in response:
|
||||
if current_event_mode != event.type:
|
||||
current_event_mode = event.type
|
||||
print(f"\n[{event.type}] ", end="", flush=True)
|
||||
|
||||
if "text.delta" in event.type:
|
||||
print(event.delta, end="", flush=True)
|
||||
elif "reasoning_text.delta" in event.type:
|
||||
print(f"{event.delta}", end="", flush=True)
|
||||
elif "response.code_interpreter_call_code.done" in event.type:
|
||||
print(f"Code: {event.code}", end="", flush=True)
|
||||
elif ("response.output_item.added" in event.type
|
||||
and event.item.type == "web_search_call"):
|
||||
print(f"Web search: {event.item.action}", end="", flush=True)
|
||||
events.append(event)
|
||||
|
||||
assert len(events) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_web_search(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="Who is the president of South Korea as of now?",
|
||||
tools=[{
|
||||
"type": "web_search_preview"
|
||||
}],
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_code_interpreter(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="Multiply 64548*15151 using builtin python interpreter.",
|
||||
tools=[{
|
||||
"type": "code_interpreter",
|
||||
"container": {
|
||||
"type": "auto"
|
||||
}
|
||||
}],
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
def get_weather(latitude, longitude):
|
||||
response = requests.get(
|
||||
f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}¤t=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m" # noqa
|
||||
)
|
||||
data = response.json()
|
||||
return data["current"]["temperature_2m"]
|
||||
|
||||
|
||||
def get_place_to_travel():
|
||||
return "Paris"
|
||||
|
||||
|
||||
def call_function(name, args):
|
||||
if name == "get_weather":
|
||||
return get_weather(**args)
|
||||
elif name == "get_place_to_travel":
|
||||
return get_place_to_travel()
|
||||
else:
|
||||
raise ValueError(f"Unknown function: {name}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling(client: OpenAI, model_name: str):
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description":
|
||||
"Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {
|
||||
"type": "number"
|
||||
},
|
||||
"longitude": {
|
||||
"type": "number"
|
||||
},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}]
|
||||
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What's the weather like in Paris today?",
|
||||
tools=tools,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
assert len(response.output) == 2
|
||||
assert response.output[0].type == "reasoning"
|
||||
assert response.output[1].type == "function_call"
|
||||
|
||||
tool_call = response.output[1]
|
||||
name = tool_call.name
|
||||
args = json.loads(tool_call.arguments)
|
||||
|
||||
result = call_function(name, args)
|
||||
|
||||
response_2 = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[{
|
||||
"type": "function_call_output",
|
||||
"call_id": tool_call.call_id,
|
||||
"output": str(result),
|
||||
}],
|
||||
tools=tools,
|
||||
previous_response_id=response.id,
|
||||
)
|
||||
assert response_2 is not None
|
||||
assert response_2.status == "completed"
|
||||
assert response_2.output_text is not None
|
||||
|
||||
# NOTE: chain-of-thought should be removed.
|
||||
response_3 = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What's the weather like in Paris today?",
|
||||
tools=tools,
|
||||
previous_response_id=response_2.id,
|
||||
)
|
||||
assert response_3 is not None
|
||||
assert response_3.status == "completed"
|
||||
assert response_3.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_place_to_travel",
|
||||
"description": "Get a random place to travel",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"required": [],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description":
|
||||
"Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {
|
||||
"type": "number"
|
||||
},
|
||||
"longitude": {
|
||||
"type": "number"
|
||||
},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
},
|
||||
]
|
||||
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=
|
||||
"Help me plan a trip to a random place. And tell me the weather there.",
|
||||
tools=tools,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
assert len(response.output) == 2
|
||||
assert response.output[0].type == "reasoning"
|
||||
assert response.output[1].type == "function_call"
|
||||
|
||||
tool_call = response.output[1]
|
||||
name = tool_call.name
|
||||
args = json.loads(tool_call.arguments)
|
||||
|
||||
result = call_function(name, args)
|
||||
|
||||
response_2 = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[{
|
||||
"type": "function_call_output",
|
||||
"call_id": tool_call.call_id,
|
||||
"output": str(result),
|
||||
}],
|
||||
tools=tools,
|
||||
previous_response_id=response.id,
|
||||
)
|
||||
assert response_2 is not None
|
||||
assert response_2.status == "completed"
|
||||
assert len(response_2.output) == 2
|
||||
assert response_2.output[0].type == "reasoning"
|
||||
assert response_2.output[1].type == "function_call"
|
||||
|
||||
tool_call = response_2.output[1]
|
||||
name = tool_call.name
|
||||
args = json.loads(tool_call.arguments)
|
||||
|
||||
result = call_function(name, args)
|
||||
|
||||
response_3 = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[{
|
||||
"type": "function_call_output",
|
||||
"call_id": tool_call.call_id,
|
||||
"output": str(result),
|
||||
}],
|
||||
tools=tools,
|
||||
previous_response_id=response_2.id,
|
||||
)
|
||||
assert response_3 is not None
|
||||
assert response_3.status == "completed"
|
||||
assert response_3.output_text is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_required(client: OpenAI, model_name: str):
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description":
|
||||
"Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {
|
||||
"type": "number"
|
||||
},
|
||||
"longitude": {
|
||||
"type": "number"
|
||||
},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}]
|
||||
|
||||
with pytest.raises(BadRequestError):
|
||||
await client.responses.create(
|
||||
model=model_name,
|
||||
input="What's the weather like in Paris today?",
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling_full_history(client: OpenAI, model_name: str):
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"name": "get_weather",
|
||||
"description":
|
||||
"Get current temperature for provided coordinates in celsius.", # noqa
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"latitude": {
|
||||
"type": "number"
|
||||
},
|
||||
"longitude": {
|
||||
"type": "number"
|
||||
},
|
||||
},
|
||||
"required": ["latitude", "longitude"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"strict": True,
|
||||
}]
|
||||
|
||||
input_messages = [{
|
||||
"role": "user",
|
||||
"content": "What's the weather like in Paris today?"
|
||||
}]
|
||||
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=input_messages,
|
||||
tools=tools,
|
||||
)
|
||||
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
tool_call = response.output[-1]
|
||||
name = tool_call.name
|
||||
args = json.loads(tool_call.arguments)
|
||||
|
||||
result = call_function(name, args)
|
||||
|
||||
input_messages.extend(
|
||||
response.output) # append model's function call message
|
||||
input_messages.append(
|
||||
{ # append result message
|
||||
"type": "function_call_output",
|
||||
"call_id": tool_call.call_id,
|
||||
"output": str(result),
|
||||
}
|
||||
)
|
||||
|
||||
response_2 = await client.responses.create(
|
||||
model=model_name,
|
||||
input=input_messages,
|
||||
tools=tools,
|
||||
)
|
||||
assert response_2 is not None
|
||||
assert response_2.status == "completed"
|
||||
assert response_2.output_text is not None
|
||||
@ -220,7 +220,9 @@ class TestModel:
|
||||
invocation_output["data"]):
|
||||
assert score_data.keys() == invocation_data.keys()
|
||||
assert score_data["score"] == pytest.approx(
|
||||
invocation_data["score"], rel=0.01)
|
||||
invocation_data["score"], rel=0.05)
|
||||
# TODO: reset this tolerance to 0.01 once we find
|
||||
# an alternative to flash_attn with bfloat16
|
||||
|
||||
def test_activation(self, server: RemoteOpenAIServer, model: dict[str,
|
||||
Any]):
|
||||
|
||||
@ -44,7 +44,7 @@ def model_uri(tmp_dir):
|
||||
def tensorize_model_and_lora(tmp_dir, model_uri):
|
||||
tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri,
|
||||
lora_dir=tmp_dir)
|
||||
args = EngineArgs(model=MODEL_NAME, device="cuda")
|
||||
args = EngineArgs(model=MODEL_NAME)
|
||||
|
||||
tensorize_lora_adapter(LORA_PATH, tensorizer_config)
|
||||
tensorize_vllm_model(args, tensorizer_config)
|
||||
|
||||
@ -35,11 +35,10 @@ FLASH_MLA_UNSUPPORTED_REASON = is_flashmla_supported()[1] \
|
||||
@pytest.mark.parametrize("block_size", [64])
|
||||
@pytest.mark.parametrize("causal", [True])
|
||||
@pytest.mark.parametrize("varlen", [False, True])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
|
||||
@torch.inference_mode()
|
||||
def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
|
||||
varlen):
|
||||
# TODO: parametrize using pytest
|
||||
dtype = torch.bfloat16
|
||||
varlen, dtype):
|
||||
device = torch.device("cuda:0")
|
||||
torch.set_default_dtype(dtype)
|
||||
torch.set_default_device(device)
|
||||
@ -48,7 +47,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
|
||||
random.seed(0)
|
||||
|
||||
print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
|
||||
f"{d=}, {dv=}, {causal=}, {varlen=}")
|
||||
f"{d=}, {dv=}, {causal=}, {varlen=}, {dtype=}")
|
||||
|
||||
cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32)
|
||||
if varlen:
|
||||
|
||||
215
tests/kernels/core/test_mrope.py
Normal file
215
tests/kernels/core/test_mrope.py
Normal file
@ -0,0 +1,215 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoConfig
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
def generate_test_data(num_tokens: int, num_q_heads: int, num_kv_heads: int,
|
||||
head_size: int, max_position_embeddings: int,
|
||||
dtype: torch.dtype, device: torch.device):
|
||||
"""Generate test data for given configuration."""
|
||||
# Create 2D positions (3, num_tokens) for multimodal case
|
||||
positions = torch.randint(0,
|
||||
max_position_embeddings // 4, (3, num_tokens),
|
||||
device=device)
|
||||
|
||||
# Create query and key tensors
|
||||
query = torch.randn(num_tokens,
|
||||
num_q_heads * head_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
key = torch.randn(num_tokens,
|
||||
num_kv_heads * head_size,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
|
||||
return positions, query, key
|
||||
|
||||
|
||||
def unroll_model_tp_dict(model_tp_dict):
|
||||
return [(model_name, tp_size)
|
||||
for model_name, tp_sizes in model_tp_dict.items()
|
||||
for tp_size in tp_sizes]
|
||||
|
||||
|
||||
model_tp_dict = {
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2-VL-72B-Instruct": [1, 2],
|
||||
"Qwen/Qwen2.5-VL-72B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2],
|
||||
}
|
||||
|
||||
# https://github.com/pytorch/pytorch/blob/main/torch/testing/_comparison.py#L1317
|
||||
dtype_atol_rtol_list = [
|
||||
[torch.bfloat16, 1e-2, 1.6e-2],
|
||||
]
|
||||
|
||||
num_tokens_list = [11, 8192]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize("model_name, tp_size",
|
||||
unroll_model_tp_dict(model_tp_dict))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("num_tokens", num_tokens_list)
|
||||
def test_mrope(model_name, tp_size, dtype, atol, rtol, num_tokens):
|
||||
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
is_neox_style = True
|
||||
|
||||
rope_theta = config.rope_theta
|
||||
max_position = config.max_position_embeddings
|
||||
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
|
||||
rotary_dim = int(head_dim * partial_rotary_factor)
|
||||
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position=max_position,
|
||||
base=rope_theta,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_scaling=config.rope_scaling,
|
||||
dtype=dtype,
|
||||
).to(device=device)
|
||||
|
||||
# create q k v input tensors
|
||||
# create rotary pos emb input tensors
|
||||
positions, query, key = generate_test_data(num_tokens, num_heads,
|
||||
num_kv_heads, head_dim,
|
||||
max_position, dtype, device)
|
||||
|
||||
query_native, key_native = mrope_helper_class.forward_native(
|
||||
positions,
|
||||
query.clone(),
|
||||
key.clone(),
|
||||
)
|
||||
|
||||
query_cuda, key_cuda = mrope_helper_class.forward_cuda(
|
||||
positions,
|
||||
query.clone(),
|
||||
key.clone(),
|
||||
)
|
||||
|
||||
torch.testing.assert_close(query_native, query_cuda, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(key_native, key_cuda, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Skipping CUDA/ROCm only tests.")
|
||||
@pytest.mark.parametrize(
|
||||
"model_name, tp_size",
|
||||
unroll_model_tp_dict({
|
||||
"Qwen/Qwen2-VL-7B-Instruct": [1, 2],
|
||||
"zai-org/GLM-4.1V-9B-Thinking": [1, 2]
|
||||
}))
|
||||
@pytest.mark.parametrize("dtype, atol, rtol", dtype_atol_rtol_list)
|
||||
@pytest.mark.parametrize("num_tokens", [4])
|
||||
def test_mrope_torch_compile_tracing(model_name, tp_size, dtype, atol, rtol,
|
||||
num_tokens):
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
|
||||
# get the model config
|
||||
total_num_kv_heads = config.num_key_value_heads
|
||||
total_num_heads = config.num_attention_heads
|
||||
num_heads = total_num_heads // tp_size
|
||||
num_kv_heads = max(1, total_num_kv_heads // tp_size)
|
||||
head_dim = config.hidden_size // total_num_heads
|
||||
is_neox_style = True
|
||||
rope_theta = config.rope_theta
|
||||
max_position = config.max_position_embeddings
|
||||
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
|
||||
rotary_dim = int(head_dim * partial_rotary_factor)
|
||||
|
||||
mrope_helper_class = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position=max_position,
|
||||
base=rope_theta,
|
||||
is_neox_style=is_neox_style,
|
||||
rope_scaling=config.rope_scaling,
|
||||
dtype=dtype,
|
||||
).to(device=device)
|
||||
|
||||
# Generate test data
|
||||
positions, query, key = generate_test_data(num_tokens, num_heads,
|
||||
num_kv_heads, head_dim,
|
||||
max_position, dtype, device)
|
||||
|
||||
# Create a wrapper that makes the in-place function appear functional
|
||||
def functional_forward_cuda(pos, q, k):
|
||||
"""Wrapper that converts in-place operation to functional style
|
||||
|
||||
CUDA Graph does not support in-place operations.
|
||||
This wrapper creates working copies of the
|
||||
input tensors and modifies them.
|
||||
"""
|
||||
q_work = q.clone() # Create working copies
|
||||
k_work = k.clone()
|
||||
# Your in-place function modifies q_work and k_work
|
||||
mrope_helper_class.forward_cuda(pos, q_work, k_work)
|
||||
return q_work, k_work # Return the modified tensors
|
||||
|
||||
# Get reference results
|
||||
query_native, key_native = mrope_helper_class.forward_native(
|
||||
positions,
|
||||
query.clone(),
|
||||
key.clone(),
|
||||
)
|
||||
|
||||
try:
|
||||
compiled_forward_cuda = torch.compile(functional_forward_cuda,
|
||||
fullgraph=True,
|
||||
backend="inductor",
|
||||
mode="reduce-overhead",
|
||||
dynamic=False)
|
||||
|
||||
# Run compiled version
|
||||
query_compiled_cuda, key_compiled_cuda = compiled_forward_cuda(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
)
|
||||
|
||||
# Run original version for comparison
|
||||
query_cuda = query.clone()
|
||||
key_cuda = key.clone()
|
||||
mrope_helper_class.forward_cuda(positions, query_cuda, key_cuda)
|
||||
|
||||
# Verify results
|
||||
torch.testing.assert_close(query_compiled_cuda,
|
||||
query_cuda,
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
torch.testing.assert_close(key_compiled_cuda,
|
||||
key_cuda,
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
torch.testing.assert_close(query_compiled_cuda,
|
||||
query_native,
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
torch.testing.assert_close(key_compiled_cuda,
|
||||
key_native,
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
|
||||
print("✓ forward_cuda successfully traced with torch.compile inductor")
|
||||
|
||||
except Exception as e:
|
||||
pytest.fail(
|
||||
f"forward_cuda failed to trace with torch.compile inductor: {e}")
|
||||
@ -187,7 +187,7 @@ def generate_continuous_batched_examples(example_lens_by_batch,
|
||||
[torch.float32, torch.float16, torch.bfloat16])
|
||||
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
|
||||
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
|
||||
@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)])
|
||||
@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
|
||||
def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
|
||||
itype):
|
||||
|
||||
@ -253,15 +253,15 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
|
||||
(8, 8, 16, 32, 16),
|
||||
]), # mode examples with varied lengths
|
||||
|
||||
# odd chunk_size
|
||||
(64, 29, 2, [(11, 4), (13, 23), (19, 22),
|
||||
(21, 15)]), # irregular sizes
|
||||
|
||||
# large-ish chunk_size (256)
|
||||
(64, 256, 1, [(5, ), (1, ), (1, ),
|
||||
(1, )]), # irregular sizes with small sequences
|
||||
(64, 256, 2, [(5, 30), (1, 2), (1, 2),
|
||||
(1, 2)]), # irregular sizes with small sequences
|
||||
|
||||
# we also need to test some large seqlen
|
||||
# to catch errors with init states decay
|
||||
(768, 128, 2, [(138, 225), (138, 225)]),
|
||||
])
|
||||
def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
itype):
|
||||
@ -271,10 +271,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
|
||||
seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases
|
||||
|
||||
# TODO: the irregular chunk size cases have some issues and require higher
|
||||
# tolerance. This is to be invesigated
|
||||
if chunk_size not in {8, 256}:
|
||||
atol, rtol = 5e-1, 5e-1
|
||||
# This test can have larger error for longer sequences
|
||||
if seqlen > 256:
|
||||
atol, rtol = 1e-2, 5e-3
|
||||
else:
|
||||
atol, rtol = 5e-3, 5e-3
|
||||
|
||||
|
||||
@ -36,7 +36,6 @@ def _set_vllm_config(vllm_config: VllmConfig, world_size: int, rank: int,
|
||||
import tempfile
|
||||
temp_file = tempfile.mkstemp()[1]
|
||||
|
||||
set_current_vllm_config(vllm_config)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
init_distributed_environment(
|
||||
world_size=world_size,
|
||||
|
||||
@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
fused_topk, modular_triton_fused_moe)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_gemm
|
||||
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
|
||||
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used
|
||||
|
||||
dg_available = has_deep_gemm()
|
||||
|
||||
@ -224,7 +224,8 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE")
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
||||
reason="Not E8M0 scale MOE")
|
||||
@torch.inference_mode()
|
||||
def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
|
||||
monkeypatch):
|
||||
|
||||
@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
|
||||
FusedMoEModularKernel)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import has_deep_ep, has_deep_gemm
|
||||
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used,
|
||||
from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used,
|
||||
is_deep_gemm_supported)
|
||||
|
||||
from .parallel_utils import ProcessGroupInfo, parallel_launch
|
||||
@ -370,7 +370,7 @@ NUM_EXPERTS = [32]
|
||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||
@requires_deep_ep
|
||||
@requires_deep_gemm
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
||||
reason="Skipping test for Blackwell DeepGEMM")
|
||||
def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
|
||||
topk: int, world_dp_size: tuple[int, int]):
|
||||
@ -427,7 +427,7 @@ USE_FP8_DISPATCH = [False]
|
||||
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
|
||||
@requires_deep_ep
|
||||
@requires_deep_gemm
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
|
||||
@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(),
|
||||
reason="Skipping test for Blackwell DeepGEMM")
|
||||
def test_ll_deepep_deepgemm_moe(
|
||||
mnk: tuple[int, int, int],
|
||||
|
||||
@ -5,6 +5,15 @@ from dataclasses import dataclass, fields
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.utils import has_triton_kernels
|
||||
|
||||
if not has_triton_kernels():
|
||||
pytest.skip(
|
||||
"triton_kernels not found, skipping all related tests",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
import triton_kernels.swiglu
|
||||
from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
|
||||
from triton_kernels.numerics import InFlexData
|
||||
@ -65,7 +74,7 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
|
||||
dtype_dict = {
|
||||
"bf16": torch.bfloat16,
|
||||
"fp8_e4m3": torch.float8_e4m3fn,
|
||||
"fp8_e5m2": torch.float8_e5m2
|
||||
"fp8_e5m2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
x = x.to(dtype_dict[a_dtype]).to(torch.bfloat16)
|
||||
@ -97,12 +106,18 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
|
||||
|
||||
x_pad = w1_bottom_pad
|
||||
|
||||
w1_tri = F.pad(w1_tri, (0, w1_right_pad, 0, w1_bottom_pad, 0, 0),
|
||||
mode="constant",
|
||||
value=0)
|
||||
w2_tri = F.pad(w2_tri, (0, w2_right_pad, 0, w2_bottom_pad, 0, 0),
|
||||
mode="constant",
|
||||
value=0)
|
||||
w1_tri = F.pad(
|
||||
w1_tri,
|
||||
(0, w1_right_pad, 0, w1_bottom_pad, 0, 0),
|
||||
mode="constant",
|
||||
value=0,
|
||||
)
|
||||
w2_tri = F.pad(
|
||||
w2_tri,
|
||||
(0, w2_right_pad, 0, w2_bottom_pad, 0, 0),
|
||||
mode="constant",
|
||||
value=0,
|
||||
)
|
||||
|
||||
w1_bias_tri = F.pad(w1_bias_tri, (0, w1_right_pad, 0, 0),
|
||||
mode="constant",
|
||||
@ -127,13 +142,19 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
|
||||
|
||||
w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout,
|
||||
**w_layout_opts)
|
||||
w1_scale_tri = convert_layout(wrap_torch_tensor(w1_scale_tri),
|
||||
w_scale_layout, **w_scale_layout_opts)
|
||||
w1_scale_tri = convert_layout(
|
||||
wrap_torch_tensor(w1_scale_tri),
|
||||
w_scale_layout,
|
||||
**w_scale_layout_opts,
|
||||
)
|
||||
|
||||
w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout,
|
||||
**w_layout_opts)
|
||||
w2_scale_tri = convert_layout(wrap_torch_tensor(w2_scale_tri),
|
||||
w_scale_layout, **w_scale_layout_opts)
|
||||
w2_scale_tri = convert_layout(
|
||||
wrap_torch_tensor(w2_scale_tri),
|
||||
w_scale_layout,
|
||||
**w_scale_layout_opts,
|
||||
)
|
||||
|
||||
pc1 = PrecisionConfig(weight_scale=w1_scale_tri,
|
||||
flex_ctx=FlexCtx(rhs_data=InFlexData()))
|
||||
@ -149,8 +170,22 @@ def init_compute_data(M, K, N, E, a_dtype: str, w_dtype: str, num_warps: int):
|
||||
w1 = w1.transpose(-1, -2).contiguous()
|
||||
w2 = w2.transpose(-1, -2).contiguous()
|
||||
|
||||
return (x, w1, w1_bias, w2, w2_bias, exp_data, x_tri, w1_tri, w2_tri,
|
||||
exp_data_tri, w1_bias_tri, w2_bias_tri, pc1, pc2)
|
||||
return (
|
||||
x,
|
||||
w1,
|
||||
w1_bias,
|
||||
w2,
|
||||
w2_bias,
|
||||
exp_data,
|
||||
x_tri,
|
||||
w1_tri,
|
||||
w2_tri,
|
||||
exp_data_tri,
|
||||
w1_bias_tri,
|
||||
w2_bias_tri,
|
||||
pc1,
|
||||
pc2,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -184,13 +219,14 @@ def swiglu(x, alpha: float = 1.702, limit: float = 1.0):
|
||||
|
||||
|
||||
def oai_moe_forward(
|
||||
hidden_states: torch.Tensor, # (M, K)
|
||||
w1: torch.Tensor, # (E, 2N)
|
||||
w1_bias: torch.Tensor, # (E, 2N, K)
|
||||
w2: torch.Tensor, # (E, K, N)
|
||||
w2_bias: torch.Tensor, # (E, N)
|
||||
gating_output: torch.Tensor, # (M, E)
|
||||
topk: int):
|
||||
hidden_states: torch.Tensor, # (M, K)
|
||||
w1: torch.Tensor, # (E, 2N)
|
||||
w1_bias: torch.Tensor, # (E, 2N, K)
|
||||
w2: torch.Tensor, # (E, K, N)
|
||||
w2_bias: torch.Tensor, # (E, N)
|
||||
gating_output: torch.Tensor, # (M, E)
|
||||
topk: int,
|
||||
):
|
||||
# model.py 309:330, assuming gating and norm
|
||||
t = hidden_states
|
||||
experts = torch.topk(gating_output, k=topk, dim=-1, sorted=True)
|
||||
@ -240,10 +276,22 @@ def test_equiv(num_token, a_dtype, w_dtype, tp):
|
||||
N = ModelConfig.intermediate_size // tp
|
||||
topk = ModelConfig.experts_per_token
|
||||
|
||||
x, w1, w1_bias, w2, w2_bias, exp_data, \
|
||||
x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri,\
|
||||
w2_bias_tri, pc1, pc2 = init_compute_data(
|
||||
M, K, N, E, a_dtype, w_dtype, num_warps=8)
|
||||
(
|
||||
x,
|
||||
w1,
|
||||
w1_bias,
|
||||
w2,
|
||||
w2_bias,
|
||||
exp_data,
|
||||
x_tri,
|
||||
w1_tri,
|
||||
w2_tri,
|
||||
exp_data_tri,
|
||||
w1_bias_tri,
|
||||
w2_bias_tri,
|
||||
pc1,
|
||||
pc2,
|
||||
) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8)
|
||||
|
||||
out_triton_monolithic = triton_kernel_moe_forward(
|
||||
hidden_states=x_tri,
|
||||
@ -255,33 +303,46 @@ def test_equiv(num_token, a_dtype, w_dtype, tp):
|
||||
w1_bias=w1_bias_tri,
|
||||
w2_bias=w2_bias_tri,
|
||||
w1_precision=pc1,
|
||||
w2_precision=pc2)
|
||||
w2_precision=pc2,
|
||||
)
|
||||
out_triton_monolithic = out_triton_monolithic[..., :K]
|
||||
|
||||
out_ref = oai_moe_forward(hidden_states=x,
|
||||
w1=w1,
|
||||
w1_bias=w1_bias,
|
||||
w2=w2,
|
||||
w2_bias=w2_bias,
|
||||
gating_output=exp_data,
|
||||
topk=topk)
|
||||
out_ref = oai_moe_forward(
|
||||
hidden_states=x,
|
||||
w1=w1,
|
||||
w1_bias=w1_bias,
|
||||
w2=w2,
|
||||
w2_bias=w2_bias,
|
||||
gating_output=exp_data,
|
||||
topk=topk,
|
||||
)
|
||||
assert_close(ref=out_ref,
|
||||
tri=out_triton_monolithic,
|
||||
maxtol=0.025,
|
||||
rmstol=0.005)
|
||||
|
||||
|
||||
def batched_moe(a: torch.Tensor, w1, w2, gating_output: torch.Tensor,
|
||||
topk: int, renormalize: bool, w1_bias: torch.Tensor,
|
||||
w2_bias: torch.Tensor, w1_precision: PrecisionConfig,
|
||||
w2_precision: PrecisionConfig) -> torch.Tensor:
|
||||
def batched_moe(
|
||||
a: torch.Tensor,
|
||||
w1,
|
||||
w2,
|
||||
gating_output: torch.Tensor,
|
||||
topk: int,
|
||||
renormalize: bool,
|
||||
w1_bias: torch.Tensor,
|
||||
w2_bias: torch.Tensor,
|
||||
w1_precision: PrecisionConfig,
|
||||
w2_precision: PrecisionConfig,
|
||||
) -> torch.Tensor:
|
||||
max_num_tokens = round_up(a.shape[0], 64)
|
||||
|
||||
fused_experts = FusedMoEModularKernel(
|
||||
BatchedPrepareAndFinalize(max_num_tokens,
|
||||
num_dispatchers=1,
|
||||
num_local_experts=w1.shape[0],
|
||||
rank=0),
|
||||
BatchedPrepareAndFinalize(
|
||||
max_num_tokens,
|
||||
num_dispatchers=1,
|
||||
num_local_experts=w1.shape[0],
|
||||
rank=0,
|
||||
),
|
||||
BatchedOAITritonExperts(
|
||||
None,
|
||||
max_num_tokens=max_num_tokens,
|
||||
@ -327,30 +388,46 @@ def test_triton_kernel_batched_moe(num_token, a_dtype, w_dtype, ep):
|
||||
N = ModelConfig.intermediate_size
|
||||
topk = ModelConfig.experts_per_token
|
||||
|
||||
x, w1, w1_bias, w2, w2_bias, exp_data, \
|
||||
x_tri, w1_tri, w2_tri, exp_data_tri, w1_bias_tri, \
|
||||
w2_bias_tri, pc1, pc2 = init_compute_data(
|
||||
M, K, N, E, a_dtype, w_dtype, num_warps=4)
|
||||
(
|
||||
x,
|
||||
w1,
|
||||
w1_bias,
|
||||
w2,
|
||||
w2_bias,
|
||||
exp_data,
|
||||
x_tri,
|
||||
w1_tri,
|
||||
w2_tri,
|
||||
exp_data_tri,
|
||||
w1_bias_tri,
|
||||
w2_bias_tri,
|
||||
pc1,
|
||||
pc2,
|
||||
) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=4)
|
||||
|
||||
out_tri = batched_moe(a=x_tri,
|
||||
w1=w1_tri,
|
||||
w2=w2_tri,
|
||||
gating_output=exp_data_tri,
|
||||
topk=topk,
|
||||
renormalize=True,
|
||||
w1_bias=w1_bias_tri,
|
||||
w2_bias=w2_bias_tri,
|
||||
w1_precision=pc1,
|
||||
w2_precision=pc2)
|
||||
out_tri = batched_moe(
|
||||
a=x_tri,
|
||||
w1=w1_tri,
|
||||
w2=w2_tri,
|
||||
gating_output=exp_data_tri,
|
||||
topk=topk,
|
||||
renormalize=True,
|
||||
w1_bias=w1_bias_tri,
|
||||
w2_bias=w2_bias_tri,
|
||||
w1_precision=pc1,
|
||||
w2_precision=pc2,
|
||||
)
|
||||
out_tri = out_tri[..., :K]
|
||||
|
||||
out_ref = oai_moe_forward(hidden_states=x,
|
||||
w1=w1,
|
||||
w1_bias=w1_bias,
|
||||
w2=w2,
|
||||
w2_bias=w2_bias,
|
||||
gating_output=exp_data,
|
||||
topk=topk)
|
||||
out_ref = oai_moe_forward(
|
||||
hidden_states=x,
|
||||
w1=w1,
|
||||
w1_bias=w1_bias,
|
||||
w2=w2,
|
||||
w2_bias=w2_bias,
|
||||
gating_output=exp_data,
|
||||
topk=topk,
|
||||
)
|
||||
assert_close(ref=out_ref, tri=out_tri, maxtol=0.025, rmstol=0.005)
|
||||
|
||||
|
||||
@ -370,6 +447,7 @@ def test_unit_shuffle():
|
||||
out = triton_kernels.swiglu.swiglu_torch(
|
||||
out,
|
||||
alpha=1.702,
|
||||
precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0))
|
||||
precision_config=triton_kernels.swiglu.PrecisionConfig(limit=1.0),
|
||||
)
|
||||
|
||||
assert_close(ref=out_ref, tri=out)
|
||||
assert_close(ref=out_ref, tri=out)
|
||||
|
||||
@ -36,7 +36,7 @@ from vllm.model_executor.models.mixtral import MixtralMoE
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
NUM_EXPERTS = [8, 64, 192]
|
||||
EP_SIZE = [1, 4]
|
||||
TOP_KS = [2, 6]
|
||||
|
||||
|
||||
@ -94,45 +94,6 @@ def test_metric_counter_generation_tokens(
|
||||
f"metric: {metric_count!r}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [128, 129])
|
||||
@pytest.mark.parametrize("disable_async_output_proc", [True, False])
|
||||
def test_metric_counter_generation_tokens_multi_step(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
disable_async_output_proc: bool,
|
||||
) -> None:
|
||||
num_scheduler_steps = 8
|
||||
with vllm_runner(
|
||||
model,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.4,
|
||||
num_scheduler_steps=num_scheduler_steps,
|
||||
disable_async_output_proc=disable_async_output_proc,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
|
||||
**stat_logger.labels)._value.get()
|
||||
vllm_generation_count = 0
|
||||
for i in range(len(example_prompts)):
|
||||
vllm_output_ids, vllm_output_str = vllm_outputs[i]
|
||||
prompt_ids = tokenizer.encode(example_prompts[i])
|
||||
# vllm_output_ids contains both prompt tokens and generation tokens.
|
||||
# We're interested only in the count of the generation tokens.
|
||||
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
|
||||
|
||||
# The multi-step scheduling will continue to execute forward even when
|
||||
# encountering EOS, leading to slightly imprecise metrics.
|
||||
assert abs(vllm_generation_count - metric_count) <\
|
||||
len(example_prompts) * num_scheduler_steps, \
|
||||
(f"generation token count: {vllm_generation_count!r}\n"
|
||||
f"metric: {metric_count!r}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -20,19 +20,15 @@ pytestmark = pytest.mark.hybrid_model
|
||||
SSM_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"tiiuae/falcon-mamba-tiny-dev",
|
||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||
"yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
]
|
||||
|
||||
HYBRID_MODELS = [
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
# NOTE: Running Plamo2 in transformers implementation requires to install
|
||||
# causal-conv1d package, which is not listed as a test dependency as it's
|
||||
# not compatible with pip-compile.
|
||||
"pfnet/plamo-2-1b",
|
||||
# skipping until vLLM implementation issues are resolved
|
||||
# "pfnet/plamo-2-1b",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
"hmellor/tiny-random-BambaForCausalLM",
|
||||
"ibm-ai-platform/Bamba-9B-v1",
|
||||
"nvidia/Nemotron-H-8B-Base-8K",
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
"tiiuae/Falcon-H1-0.5B-Base",
|
||||
]
|
||||
@ -42,23 +38,18 @@ HF_UNSUPPORTED_MODELS = [
|
||||
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
|
||||
# doesn't compare vLLM output with HF output.
|
||||
# See https://github.com/huggingface/transformers/pull/35943
|
||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||
# Note: I'm not seeing the same output from vLLM V0 vs. HF transformers
|
||||
# for Nemotron-H-8B; currently only compare vLLM V0 vs. vLLM V1
|
||||
"nvidia/Nemotron-H-8B-Base-8K",
|
||||
# NOTE: Currently the test fails due to HF transformers issue fixed in:
|
||||
# https://github.com/huggingface/transformers/pull/39033
|
||||
# We will enable vLLM test for Granite after next HF transformers release.
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
"yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
# transformers 4.55 is still producing garbage for this model
|
||||
# TODO(tdoublep): follow-up on transformers side
|
||||
"ibm-granite/granite-4.0-tiny-preview"
|
||||
]
|
||||
|
||||
V1_SUPPORTED_MODELS = [
|
||||
"state-spaces/mamba-130m-hf",
|
||||
"ai21labs/Jamba-tiny-dev",
|
||||
"mistralai/Mamba-Codestral-7B-v0.1",
|
||||
"ibm-ai-platform/Bamba-9B-v1",
|
||||
"yujiepan/mamba2-codestral-v0.1-tiny-random",
|
||||
"Zyphra/Zamba2-1.2B-instruct",
|
||||
"nvidia/Nemotron-H-8B-Base-8K",
|
||||
"hmellor/tiny-random-BambaForCausalLM",
|
||||
"ibm-granite/granite-4.0-tiny-preview",
|
||||
"tiiuae/Falcon-H1-0.5B-Base",
|
||||
]
|
||||
@ -83,12 +74,16 @@ def test_models(
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
hf_version_check = model_info.check_transformers_version(
|
||||
on_fail="return")
|
||||
except ValueError:
|
||||
pass
|
||||
hf_version_check = None
|
||||
|
||||
if hf_version_check is not None:
|
||||
print(f"Skipping transformers comparison because: {hf_version_check}")
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
if model not in HF_UNSUPPORTED_MODELS:
|
||||
if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
@ -336,32 +331,6 @@ def test_state_cleanup(
|
||||
"could be related to finished_requests_ids")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
def test_multistep_correctness(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
with vllm_runner(model, num_scheduler_steps=8,
|
||||
max_num_seqs=2) as vllm_model:
|
||||
vllm_outputs_multistep = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
with vllm_runner(model, num_scheduler_steps=1,
|
||||
max_num_seqs=2) as vllm_model:
|
||||
vllm_outputs_single_step = vllm_model.generate_greedy(
|
||||
example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_outputs_multistep,
|
||||
outputs_1_lst=vllm_outputs_single_step,
|
||||
name_0="vllm_outputs_multistep",
|
||||
name_1="vllm_outputs_single_step",
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@ -389,3 +358,63 @@ def test_distributed_correctness(
|
||||
name_0="vllm_tp_1",
|
||||
name_1="vllm_tp_2",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_full_cuda_graph(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
monkeypatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
|
||||
try:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
with hf_runner(model) as hf_model:
|
||||
if model not in HF_UNSUPPORTED_MODELS:
|
||||
hf_outputs = hf_model.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
else:
|
||||
hf_outputs = None
|
||||
|
||||
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
|
||||
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
if model in HYBRID_MODELS:
|
||||
# required due to reorder_batch behaviour
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
compilation_config={'full_cuda_graph': True},
|
||||
enable_prefix_caching=False) as vllm_model:
|
||||
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
if hf_outputs is not None:
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_v0_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm-v0",
|
||||
)
|
||||
|
||||
ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=ref_outputs,
|
||||
outputs_1_lst=vllm_v1_outputs,
|
||||
name_0="hf" if hf_outputs is not None else "vllm-v0",
|
||||
name_1="vllm-v1",
|
||||
)
|
||||
|
||||
@ -177,9 +177,12 @@ def mteb_test_embed_models(hf_runner,
|
||||
max_model_len=None,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
if model_info.architecture:
|
||||
assert (model_info.architecture
|
||||
in vllm_model.llm.llm_engine.model_config.architectures)
|
||||
assert model_info.architecture in model_config.architectures
|
||||
assert (model_config._model_info.default_pooling_type ==
|
||||
model_info.default_pooling_type)
|
||||
|
||||
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
|
||||
MTEB_EMBED_TASKS)
|
||||
@ -286,7 +289,12 @@ def mteb_test_rerank_models(hf_runner,
|
||||
**vllm_extra_kwargs) as vllm_model:
|
||||
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
|
||||
if model_info.architecture:
|
||||
assert (model_info.architecture in model_config.architectures)
|
||||
assert model_config.hf_config.num_labels == 1
|
||||
assert (model_config._model_info.default_pooling_type ==
|
||||
model_info.default_pooling_type)
|
||||
|
||||
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
|
||||
tasks=MTEB_RERANK_TASKS,
|
||||
|
||||
@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
||||
from tests.models.language.pooling.embed_utils import (
|
||||
run_embedding_correctness_test)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["jason9693/Qwen2.5-1.5B-apeach"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_classify_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
|
||||
example_prompts = example_prompts * 2
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
enable_prefix_caching=True) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
vllm_outputs = vllm_model.classify(example_prompts)
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=AutoModelForSequenceClassification) as hf_model:
|
||||
hf_outputs = hf_model.classify(example_prompts)
|
||||
|
||||
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
|
||||
hf_output = torch.tensor(hf_output)
|
||||
vllm_output = torch.tensor(vllm_output)
|
||||
|
||||
assert torch.allclose(hf_output, vllm_output,
|
||||
1e-3 if dtype == "float" else 1e-2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
["Qwen/Qwen3-Embedding-0.6B"],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_embed_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
):
|
||||
example_prompts = [str(s).strip() for s in example_prompts] * 2
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
enable_prefix_caching=True,
|
||||
) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert cache_config.enable_prefix_caching
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
is_sentence_transformer=True,
|
||||
) as hf_model:
|
||||
run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"intfloat/e5-small",
|
||||
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", # is_causal == False
|
||||
"papluca/xlm-roberta-base-language-detection",
|
||||
])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_non_causal_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
dtype: str) -> None:
|
||||
with vllm_runner(model,
|
||||
max_model_len=512,
|
||||
dtype=dtype,
|
||||
enable_prefix_caching=True) as vllm_model:
|
||||
cache_config = vllm_model.llm.llm_engine.cache_config
|
||||
assert not cache_config.enable_prefix_caching
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user