Compare commits
108 Commits
v0.8.0rc1
...
v1-sched-i
| Author | SHA1 | Date | |
|---|---|---|---|
| 8db54c7912 | |||
| 0c6f5023c3 | |||
| 06dd08256f | |||
| 2b22290ce0 | |||
| 530dbecd1a | |||
| d8e82bc06d | |||
| 086b56824c | |||
| 5a0905ba2a | |||
| a8f12a63fd | |||
| 69ae2380c6 | |||
| 27261e40a6 | |||
| e3f813c33b | |||
| c607a2652b | |||
| 3d45e3d749 | |||
| 742369d35a | |||
| bfe2fe0af4 | |||
| a8652f4f0f | |||
| 2f726b241e | |||
| a597a57595 | |||
| ae65f3e237 | |||
| 34868b106a | |||
| 1f16b7fe74 | |||
| b88be22165 | |||
| d8c6d7d6b5 | |||
| 40828ce5fe | |||
| ffa443afed | |||
| 70e500cad9 | |||
| 4cb1c05c9e | |||
| c47aafa37c | |||
| cfbca8a2f2 | |||
| 0fe5609874 | |||
| 22d33baca2 | |||
| b0e96aaebb | |||
| 8310e0b59b | |||
| 26dd972adb | |||
| 61c7a1b856 | |||
| 374ee287d8 | |||
| a4d83661d7 | |||
| 8363cd093d | |||
| 6c5a3195db | |||
| 073d1ed354 | |||
| 3d446433ec | |||
| 1fe0fd12d3 | |||
| dafb4e504a | |||
| 68cf1601d3 | |||
| 61f412187d | |||
| 05ccd0aa35 | |||
| f690372b68 | |||
| 8b3e94a357 | |||
| 437f9162d0 | |||
| 4f065f12f5 | |||
| 228b768db6 | |||
| 027827cc1d | |||
| 72a8639b68 | |||
| 99abb8b650 | |||
| 3a1e648158 | |||
| 46c759c165 | |||
| 179a619c21 | |||
| 452e8fd968 | |||
| 8b793f7ec6 | |||
| af35d3a3cc | |||
| 3b457143d2 | |||
| ab656f2c2f | |||
| 64fc2193dc | |||
| dd732028f5 | |||
| 414919138b | |||
| db7c8ca910 | |||
| f863ffc965 | |||
| 400d483e87 | |||
| d1695758b2 | |||
| 53a0cf8b95 | |||
| 5eeabc2a44 | |||
| 18551e820c | |||
| e41e160263 | |||
| b89fb2a4a1 | |||
| 5340b0e221 | |||
| 37e3806132 | |||
| c0efdd655b | |||
| aaaec52ad9 | |||
| e1eb45d397 | |||
| 89fca671fb | |||
| d20b0c139c | |||
| 166a168b0f | |||
| 2bb0e1a799 | |||
| 6eaf1e5c52 | |||
| 868a8c5b2c | |||
| b4ad56c1bd | |||
| 69698f257e | |||
| cd0cd85102 | |||
| 0a74bfce9c | |||
| dd3b865854 | |||
| 9b87a579aa | |||
| b539222d4e | |||
| ada8a47b12 | |||
| 6b42a56d46 | |||
| a7facf98d9 | |||
| 1e7bf7970a | |||
| 24ce0a7638 | |||
| e484ecb947 | |||
| da07067215 | |||
| 6cd1b1a18c | |||
| 8730469cfa | |||
| 5b38e984b3 | |||
| 06e22ba44c | |||
| 8d46d5d11d | |||
| f198d7d07a | |||
| 0bf6e97493 | |||
| 6e7209347d |
@ -82,7 +82,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
@ -15,13 +15,22 @@ remove_docker_container
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
&& echo TEST_1 \
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& echo TEST_2 \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& echo TEST_3 \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& echo TEST_4 \
|
||||
&& VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& echo TEST_5 \
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||
|
||||
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
|
||||
@ -12,10 +12,11 @@ docker build -t ${image_name} -f Dockerfile.xpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
|
||||
docker rm -f "${container_name}" || true;
|
||||
docker image rm -f "${image_name}" || true;
|
||||
docker system prune -f || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and test offline inference/tensor parallel
|
||||
docker run \
|
||||
@ -25,6 +26,6 @@ docker run \
|
||||
--name "${container_name}" \
|
||||
"${image_name}" \
|
||||
sh -c '
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
||||
'
|
||||
|
||||
@ -136,6 +136,10 @@ steps:
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
# test with tp=2 and external_dp=2
|
||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
@ -200,6 +204,7 @@ steps:
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/engine
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/structured_output
|
||||
@ -226,10 +231,13 @@ steps:
|
||||
- python3 offline_inference/basic/chat.py
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 offline_inference/vision_language.py
|
||||
- python3 offline_inference/vision_language_multi_image.py
|
||||
- python3 offline_inference/audio_language.py --seed 0
|
||||
- python3 offline_inference/vision_language.py --seed 0
|
||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 offline_inference/encoder_decoder.py
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
- python3 offline_inference/basic/classify.py
|
||||
- python3 offline_inference/basic/embed.py
|
||||
- python3 offline_inference/basic/score.py
|
||||
@ -291,6 +299,7 @@ steps:
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
source_file_dependencies:
|
||||
@ -507,8 +516,6 @@ steps:
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
commands:
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
|
||||
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
|
||||
28
.github/ISSUE_TEMPLATE/800-misc-discussion.yml
vendored
28
.github/ISSUE_TEMPLATE/800-misc-discussion.yml
vendored
@ -1,28 +0,0 @@
|
||||
name: 🎲 Misc/random discussions that do not fit into the above categories.
|
||||
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
|
||||
title: "[Misc]: "
|
||||
labels: ["misc"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Anything you want to discuss about vllm.
|
||||
description: >
|
||||
Anything you want to discuss about vllm.
|
||||
validations:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
||||
- type: checkboxes
|
||||
id: askllm
|
||||
attributes:
|
||||
label: Before submitting a new issue...
|
||||
options:
|
||||
- label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
|
||||
required: true
|
||||
4
.github/ISSUE_TEMPLATE/config.yml
vendored
4
.github/ISSUE_TEMPLATE/config.yml
vendored
@ -1 +1,5 @@
|
||||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: Questions
|
||||
url: https://discuss.vllm.ai
|
||||
about: Ask questions and discuss with other vLLM community members
|
||||
|
||||
@ -1,11 +1,7 @@
|
||||
FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
|
||||
# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually.
|
||||
FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
||||
RUN rm /etc/apt/sources.list.d/intel-graphics.list
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends --fix-missing \
|
||||
@ -21,8 +17,6 @@ RUN apt-get update -y && \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-pip \
|
||||
libze-intel-gpu-dev \
|
||||
libze-intel-gpu1 \
|
||||
wget
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
18
README.md
18
README.md
@ -10,21 +10,20 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
</h3>
|
||||
|
||||
<p align="center">
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||
</p>
|
||||
|
||||
---
|
||||
|
||||
We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
|
||||
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
|
||||
|
||||
Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
|
||||
|
||||
👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
|
||||
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||
@ -152,10 +151,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
||||
|
||||
## Contact Us
|
||||
|
||||
- For technical questions and feature requests, please use GitHub issues or discussions.
|
||||
- For discussing with fellow users and coordinating contributions and development, please use Slack.
|
||||
- For security disclosures, please use GitHub's security advisory feature.
|
||||
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
||||
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
|
||||
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
|
||||
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
|
||||
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
|
||||
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
|
||||
|
||||
## Media Kit
|
||||
|
||||
|
||||
@ -42,7 +42,7 @@ become available.
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">🟡</td>
|
||||
<td style="text-align: center;">🟡</td>
|
||||
<td>Specify your dataset path on HuggingFace</td>
|
||||
</tr>
|
||||
@ -60,8 +60,8 @@ become available.
|
||||
🚧: to be supported
|
||||
|
||||
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
|
||||
similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
|
||||
formats, please consider contributing.
|
||||
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
|
||||
If you need support for other dataset formats, please consider contributing.
|
||||
|
||||
**Note**: VisionArena’s `dataset-name` should be set to `hf`
|
||||
|
||||
@ -139,6 +139,57 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--num-prompts "${NUM_PROMPTS}"
|
||||
```
|
||||
|
||||
### HuggingFaceDataset Examples
|
||||
|
||||
Currently, HuggingFaceDataset only supports dataset formats
|
||||
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
|
||||
formats, please consider contributing.
|
||||
|
||||
```bash
|
||||
# need a model with vision capability here
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||
|
||||
```bash
|
||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
|
||||
NUM_PROMPTS=10
|
||||
BACKEND="openai-chat"
|
||||
DATASET_NAME="hf"
|
||||
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
|
||||
DATASET_SPLIT='train'
|
||||
DATASET_SUBSET='chart2text(cauldron)'
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend "${BACKEND}" \
|
||||
--model "${MODEL_NAME}" \
|
||||
--endpoint "/v1/chat/completions" \
|
||||
--dataset-name "${DATASET_NAME}" \
|
||||
--dataset-path "${DATASET_PATH}" \
|
||||
--hf-split "${DATASET_SPLIT}" \
|
||||
--num-prompts "${NUM_PROMPTS}" \
|
||||
--hf-subset "${DATASET_SUBSET}"
|
||||
```
|
||||
|
||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||
|
||||
```bash
|
||||
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
|
||||
NUM_PROMPTS=10
|
||||
BACKEND="openai-chat"
|
||||
DATASET_NAME="hf"
|
||||
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
|
||||
DATASET_SPLIT='train'
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend "${BACKEND}" \
|
||||
--model "${MODEL_NAME}" \
|
||||
--endpoint "/v1/chat/completions" \
|
||||
--dataset-name "${DATASET_NAME}" \
|
||||
--dataset-path "${DATASET_PATH}" \
|
||||
--hf-split "${DATASET_SPLIT}" \
|
||||
--num-prompts "${NUM_PROMPTS}" \
|
||||
```
|
||||
|
||||
---
|
||||
## Example - Offline Throughput Benchmark
|
||||
|
||||
|
||||
@ -63,7 +63,7 @@ async def async_request_tgi(
|
||||
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
||||
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
||||
"truncate": request_func_input.prompt_len,
|
||||
# TGI does not accept ignore_eos flag.
|
||||
"ignore_eos_token": request_func_input.ignore_eos,
|
||||
}
|
||||
payload = {
|
||||
"inputs": request_func_input.prompt,
|
||||
@ -71,6 +71,10 @@ async def async_request_tgi(
|
||||
}
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
if request_func_input.ignore_eos:
|
||||
output.output_tokens = request_func_input.output_len
|
||||
else:
|
||||
output.output_tokens = None
|
||||
|
||||
ttft = 0.0
|
||||
st = time.perf_counter()
|
||||
|
||||
@ -17,6 +17,7 @@ SampleRequest instances, similar to the approach used in ShareGPT.
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Mapping
|
||||
@ -35,6 +36,8 @@ from vllm.lora.utils import get_adapter_absolute_path
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Data Classes
|
||||
# -----------------------------------------------------------------------------
|
||||
@ -61,9 +64,6 @@ class SampleRequest:
|
||||
class BenchmarkDataset(ABC):
|
||||
DEFAULT_SEED = 0
|
||||
|
||||
# num_requests has default 1000 in both the benchmark_serving.py and
|
||||
# benchmark_throughput.py
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dataset_path: Optional[str] = None,
|
||||
@ -90,8 +90,8 @@ class BenchmarkDataset(ABC):
|
||||
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
|
||||
"""
|
||||
Transform a prompt and optional multimodal content into a chat format.
|
||||
This method is used for chat models that expect a specific
|
||||
conversation format.
|
||||
This method is used for chat models that expect a specific conversation
|
||||
format.
|
||||
"""
|
||||
content = [{"text": prompt, "type": "text"}]
|
||||
if mm_content is not None:
|
||||
@ -101,10 +101,10 @@ class BenchmarkDataset(ABC):
|
||||
def load_data(self) -> None:
|
||||
"""
|
||||
Load data from the dataset path into self.data.
|
||||
|
||||
|
||||
This method must be overridden by subclasses since the method to load
|
||||
data will vary depending on the dataset format and source.
|
||||
|
||||
|
||||
Raises:
|
||||
NotImplementedError: If a subclass does not implement this method.
|
||||
"""
|
||||
@ -121,18 +121,18 @@ class BenchmarkDataset(ABC):
|
||||
"""
|
||||
Optionally select a random LoRA request and return its associated
|
||||
tokenizer.
|
||||
|
||||
|
||||
This method is used when LoRA parameters are provided. It randomly
|
||||
selects a LoRA based on max_loras and retrieves a cached tokenizer for
|
||||
that LoRA if available. Otherwise, it returns the base tokenizer.
|
||||
|
||||
|
||||
Args:
|
||||
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
|
||||
LoRA is selected. max_loras (Optional[int]): The maximum number of
|
||||
LoRAs available. If None, LoRA is not used. lora_path
|
||||
(Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
|
||||
is not used.
|
||||
|
||||
|
||||
Returns:
|
||||
tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
|
||||
element is a LoRARequest (or None if not applicable) and the second
|
||||
@ -160,21 +160,39 @@ class BenchmarkDataset(ABC):
|
||||
num_requests: int) -> list[SampleRequest]:
|
||||
"""
|
||||
Abstract method to generate sample requests from the dataset.
|
||||
|
||||
|
||||
Subclasses must override this method to implement dataset-specific logic
|
||||
for generating a list of SampleRequest objects.
|
||||
|
||||
|
||||
Args:
|
||||
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
|
||||
for processing the dataset's text.
|
||||
num_requests (int): The number of sample requests to generate.
|
||||
|
||||
|
||||
Returns:
|
||||
list[SampleRequest]: A list of sample requests generated from the
|
||||
dataset.
|
||||
"""
|
||||
raise NotImplementedError("sample must be implemented in subclasses.")
|
||||
|
||||
def maybe_oversample_requests(self, requests: list[SampleRequest],
|
||||
num_requests: int) -> None:
|
||||
"""
|
||||
Oversamples the list of requests if its size is less than the desired
|
||||
number.
|
||||
|
||||
Args:
|
||||
requests (List[SampleRequest]): The current list of sampled
|
||||
requests. num_requests (int): The target number of requests.
|
||||
"""
|
||||
if len(requests) < num_requests:
|
||||
random.seed(self.random_seed)
|
||||
additional = random.choices(requests,
|
||||
k=num_requests - len(requests))
|
||||
requests.extend(additional)
|
||||
logger.info("Oversampled requests to reach %d total samples.",
|
||||
num_requests)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Utility Functions and Global Caches
|
||||
@ -276,15 +294,16 @@ class RandomDataset(BenchmarkDataset):
|
||||
) -> None:
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def sample(self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
prefix_len: int = DEFAULT_PREFIX_LEN,
|
||||
range_ratio: float = DEFAULT_RANGE_RATIO,
|
||||
input_len: int = DEFAULT_INPUT_LEN,
|
||||
output_len: int = DEFAULT_OUTPUT_LEN,
|
||||
**kwargs) -> list[SampleRequest]:
|
||||
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
prefix_len: int = DEFAULT_PREFIX_LEN,
|
||||
range_ratio: float = DEFAULT_RANGE_RATIO,
|
||||
input_len: int = DEFAULT_INPUT_LEN,
|
||||
output_len: int = DEFAULT_OUTPUT_LEN,
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
vocab_size = tokenizer.vocab_size
|
||||
|
||||
prefix_token_ids = (np.random.randint(
|
||||
@ -346,20 +365,24 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
random.seed(self.random_seed)
|
||||
random.shuffle(self.data)
|
||||
|
||||
def sample(self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
lora_path: Optional[str] = None,
|
||||
max_loras: Optional[int] = None,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
**kwargs) -> list:
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
lora_path: Optional[str] = None,
|
||||
max_loras: Optional[int] = None,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
**kwargs,
|
||||
) -> list:
|
||||
samples: list = []
|
||||
for entry in self.data:
|
||||
if len(samples) >= num_requests:
|
||||
break
|
||||
prompt, completion = entry["conversations"][0]["value"],\
|
||||
entry["conversations"][1]["value"]
|
||||
prompt, completion = (
|
||||
entry["conversations"][0]["value"],
|
||||
entry["conversations"][1]["value"],
|
||||
)
|
||||
|
||||
lora_request, tokenizer = self.get_random_lora_request(
|
||||
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
|
||||
@ -383,6 +406,7 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
expected_output_len=new_output_len,
|
||||
lora_request=lora_request,
|
||||
))
|
||||
self.maybe_oversample_requests(samples, num_requests)
|
||||
return samples
|
||||
|
||||
|
||||
@ -415,19 +439,20 @@ class SonnetDataset(BenchmarkDataset):
|
||||
with open(self.dataset_path, encoding="utf-8") as f:
|
||||
self.data = f.readlines()
|
||||
|
||||
def sample(self,
|
||||
tokenizer,
|
||||
num_requests: int,
|
||||
prefix_len: int = DEFAULT_PREFIX_LEN,
|
||||
input_len: int = DEFAULT_INPUT_LEN,
|
||||
output_len: int = DEFAULT_OUTPUT_LEN,
|
||||
return_prompt_formatted: bool = False,
|
||||
**kwargs) -> list:
|
||||
def sample(
|
||||
self,
|
||||
tokenizer,
|
||||
num_requests: int,
|
||||
prefix_len: int = DEFAULT_PREFIX_LEN,
|
||||
input_len: int = DEFAULT_INPUT_LEN,
|
||||
output_len: int = DEFAULT_OUTPUT_LEN,
|
||||
return_prompt_formatted: bool = False,
|
||||
**kwargs,
|
||||
) -> list:
|
||||
# Calculate average token length for a poem line.
|
||||
tokenized_lines = [tokenizer(line).input_ids for line in self.data]
|
||||
avg_len = sum(len(tokens)
|
||||
for tokens in \
|
||||
tokenized_lines) / len(tokenized_lines)
|
||||
for tokens in tokenized_lines) / len(tokenized_lines)
|
||||
|
||||
# Build the base prompt.
|
||||
base_prompt = "Pick as many lines as you can from these poem lines:\n"
|
||||
@ -506,12 +531,14 @@ class BurstGPTDataset(BenchmarkDataset):
|
||||
# Convert the dataframe to a list of lists.
|
||||
return data.values.tolist()
|
||||
|
||||
def sample(self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
max_loras: Optional[int] = None,
|
||||
lora_path: Optional[str] = None,
|
||||
**kwargs) -> list[SampleRequest]:
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
max_loras: Optional[int] = None,
|
||||
lora_path: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
samples = []
|
||||
data = self._sample_loaded_data(num_requests=num_requests)
|
||||
for i in range(num_requests):
|
||||
@ -544,7 +571,6 @@ class HuggingFaceDataset(BenchmarkDataset):
|
||||
Dataset class for processing a HuggingFace dataset with conversation data
|
||||
and optional images.
|
||||
"""
|
||||
DEFAULT_NUM_REQUESTS = 1000
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -618,6 +644,7 @@ class HuggingFaceDataset(BenchmarkDataset):
|
||||
expected_output_len=output_len,
|
||||
multi_modal_data=mm_content,
|
||||
))
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
return sampled_requests
|
||||
|
||||
|
||||
@ -632,7 +659,6 @@ class VisionArenaDataset(HuggingFaceDataset):
|
||||
"""
|
||||
|
||||
DEFAULT_OUTPUT_LEN = 128
|
||||
DEFAULT_NUM_REQUESTS = 1000
|
||||
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
|
||||
|
||||
def __init__(
|
||||
@ -657,12 +683,14 @@ class VisionArenaDataset(HuggingFaceDataset):
|
||||
)
|
||||
self.data = dataset.shuffle(seed=self.random_seed)
|
||||
|
||||
def sample(self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
**kwargs) -> list:
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
**kwargs,
|
||||
) -> list:
|
||||
output_len = (output_len
|
||||
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
||||
sampled_requests = []
|
||||
@ -685,4 +713,5 @@ class VisionArenaDataset(HuggingFaceDataset):
|
||||
expected_output_len=output_len,
|
||||
multi_modal_data=mm_content,
|
||||
))
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
return sampled_requests
|
||||
|
||||
@ -732,8 +732,11 @@ def main(args: argparse.Namespace):
|
||||
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
|
||||
base_url = f"http://{args.host}:{args.port}"
|
||||
|
||||
tokenizer = get_tokenizer(tokenizer_id,
|
||||
trust_remote_code=args.trust_remote_code)
|
||||
tokenizer = get_tokenizer(
|
||||
tokenizer_id,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
tokenizer_mode=args.tokenizer_mode,
|
||||
)
|
||||
|
||||
if args.dataset == 'grammar':
|
||||
args.structure_type = 'guided_grammar'
|
||||
@ -876,6 +879,13 @@ if __name__ == "__main__":
|
||||
help=
|
||||
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer-mode",
|
||||
type=str,
|
||||
default="auto",
|
||||
help=
|
||||
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-prompts",
|
||||
type=int,
|
||||
@ -989,11 +999,12 @@ if __name__ == "__main__":
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Ratio of Structured Outputs requests")
|
||||
parser.add_argument("--structured-output-backend",
|
||||
type=str,
|
||||
choices=["outlines", "lm-format-enforcer", "xgrammar"],
|
||||
default="xgrammar",
|
||||
help="Backend to use for structured outputs")
|
||||
parser.add_argument(
|
||||
"--structured-output-backend",
|
||||
type=str,
|
||||
choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
|
||||
default="xgrammar",
|
||||
help="Backend to use for structured outputs")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -17,13 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
|
||||
from utils import ArgPool, Bench, CudaGraphBenchParams
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
|
||||
from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
|
||||
from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
|
||||
from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
|
||||
from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
|
||||
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
|
||||
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
||||
from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||
@ -167,69 +162,25 @@ class OpType(Enum):
|
||||
"""
|
||||
LoRA Ops to benchmark and its properties.
|
||||
"""
|
||||
SGMV_SHRINK = auto()
|
||||
BGMV_SHRINK = auto()
|
||||
SGMV_EXPAND = auto()
|
||||
BGMV_EXPAND = auto()
|
||||
BGMV_EXPAND_SLICE = auto()
|
||||
V1_SHRINK = auto()
|
||||
V1_EXPAND = auto()
|
||||
LORA_SHRINK = auto()
|
||||
LORA_EXPAND = auto()
|
||||
|
||||
@staticmethod
|
||||
def from_str(s: str) -> "OpType":
|
||||
if s.lower() == 'sgmv_shrink':
|
||||
return OpType.SGMV_SHRINK
|
||||
if s.lower() == 'sgmv_expand':
|
||||
return OpType.SGMV_EXPAND
|
||||
if s.lower() == 'bgmv_shrink':
|
||||
return OpType.BGMV_SHRINK
|
||||
if s.lower() == 'bgmv_expand':
|
||||
return OpType.BGMV_EXPAND
|
||||
if s.lower() == "bgmv_expand_slice":
|
||||
return OpType.BGMV_EXPAND_SLICE
|
||||
if s.lower() == "v1_shrink":
|
||||
return OpType.V1_SHRINK
|
||||
if s.lower() == "v1_expand":
|
||||
return OpType.V1_EXPAND
|
||||
if s.lower() == "lora_shrink":
|
||||
return OpType.LORA_SHRINK
|
||||
if s.lower() == "lora_expand":
|
||||
return OpType.LORA_EXPAND
|
||||
raise ValueError(f"Unrecognized str {s} to convert to OpType")
|
||||
|
||||
def is_shrink_fn(self) -> bool:
|
||||
return self in [
|
||||
OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
|
||||
]
|
||||
return self in [OpType.LORA_SHRINK]
|
||||
|
||||
def is_expand_fn(self) -> bool:
|
||||
return self in [
|
||||
OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
|
||||
]
|
||||
|
||||
def is_prefill_op(self) -> bool:
|
||||
return self in [
|
||||
OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
|
||||
OpType.V1_EXPAND
|
||||
]
|
||||
|
||||
def is_decode_op(self) -> bool:
|
||||
return self in [
|
||||
OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
|
||||
OpType.V1_SHRINK, OpType.V1_EXPAND
|
||||
]
|
||||
|
||||
def is_expand_slice_fn(self) -> bool:
|
||||
return self in [OpType.BGMV_EXPAND_SLICE]
|
||||
return self in [OpType.LORA_EXPAND]
|
||||
|
||||
def num_slices(self) -> list[int]:
|
||||
if self in [
|
||||
OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
|
||||
OpType.V1_EXPAND
|
||||
]:
|
||||
# SGMV kernels and v1 kernels supports slices
|
||||
return [1, 2, 3]
|
||||
if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
|
||||
return [1]
|
||||
if self in [OpType.BGMV_EXPAND_SLICE]:
|
||||
return [2, 3]
|
||||
raise ValueError(f"Unrecognized OpType {self}")
|
||||
return [1, 2, 3]
|
||||
|
||||
def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
|
||||
lora_rank: int) -> tuple[int, int, int]:
|
||||
@ -239,7 +190,7 @@ class OpType(Enum):
|
||||
k = hidden_size
|
||||
n = lora_rank
|
||||
else:
|
||||
assert self.is_expand_fn() or self.is_expand_slice_fn()
|
||||
assert self.is_expand_fn()
|
||||
m = num_tokens
|
||||
k = lora_rank
|
||||
n = hidden_size
|
||||
@ -254,7 +205,7 @@ class OpType(Enum):
|
||||
if self.is_shrink_fn():
|
||||
return op_dtype, op_dtype, torch.float32
|
||||
else:
|
||||
assert self.is_expand_fn() or self.is_expand_slice_fn()
|
||||
assert self.is_expand_fn()
|
||||
return torch.float32, op_dtype, op_dtype
|
||||
|
||||
def matmul_shapes(
|
||||
@ -268,43 +219,19 @@ class OpType(Enum):
|
||||
m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
|
||||
|
||||
b_shape = (num_loras, n, k) # col-major
|
||||
if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
|
||||
# SGMV shrink and V1 shrink kernels support num_slices inherently
|
||||
# in the kernel.
|
||||
if self in [OpType.LORA_SHRINK]:
|
||||
# LoRA shrink kernels support num_slices inherently in the kernel.
|
||||
return ((m, k), b_shape, (num_slices, m, n))
|
||||
if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
|
||||
# SGMV expand and V1 expand kernels support num_slices inherently
|
||||
# in the kernel
|
||||
if self in [OpType.LORA_EXPAND]:
|
||||
# LoRA expand kernels support num_slices inherently in the kernel
|
||||
return ((num_slices, m, k), b_shape, (m, n * num_slices))
|
||||
if self == OpType.BGMV_SHRINK:
|
||||
return ((m, k), b_shape, (m, n))
|
||||
if self == OpType.BGMV_EXPAND:
|
||||
return ((m, k), b_shape, (m, n))
|
||||
if self == OpType.BGMV_EXPAND_SLICE:
|
||||
return ((num_slices, m, k), b_shape, (m, n * num_slices))
|
||||
|
||||
raise ValueError(f"Unrecognized op_type {self}")
|
||||
|
||||
def bench_fn(self) -> Callable:
|
||||
|
||||
def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
|
||||
for x in kwargs_list:
|
||||
bgmv_expand_slice(**x)
|
||||
|
||||
if self == OpType.SGMV_SHRINK:
|
||||
return sgmv_shrink
|
||||
if self == OpType.SGMV_EXPAND:
|
||||
return sgmv_expand
|
||||
if self == OpType.BGMV_SHRINK:
|
||||
return bgmv_shrink
|
||||
if self == OpType.BGMV_EXPAND:
|
||||
return bgmv_expand
|
||||
if self == OpType.BGMV_EXPAND_SLICE:
|
||||
return emulate_bgmv_expand_slice
|
||||
if self == OpType.V1_SHRINK:
|
||||
return v1_shrink
|
||||
if self == OpType.V1_EXPAND:
|
||||
return v1_expand
|
||||
if self == OpType.LORA_SHRINK:
|
||||
return lora_shrink
|
||||
if self == OpType.LORA_EXPAND:
|
||||
return lora_expand
|
||||
|
||||
raise ValueError(f"Unrecognized optype {self}")
|
||||
|
||||
@ -318,34 +245,13 @@ class OpType(Enum):
|
||||
"""
|
||||
w_dtype = lora_weights[0].dtype
|
||||
num_slices = len(lora_weights)
|
||||
if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
|
||||
if self in [OpType.LORA_SHRINK]:
|
||||
for slice_idx in range(num_slices):
|
||||
ref_group_gemm(ref_out=output[slice_idx, :],
|
||||
input=input,
|
||||
lora_weights=lora_weights[slice_idx],
|
||||
**kwargs)
|
||||
elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
|
||||
hidden_size = lora_weights[0].shape[1]
|
||||
for slice_idx in range(num_slices):
|
||||
slice_offset = slice_idx * hidden_size
|
||||
ref_group_gemm(
|
||||
ref_out=output[:, slice_offset:slice_offset + hidden_size],
|
||||
input=input[slice_idx].clone().to(dtype=w_dtype),
|
||||
lora_weights=lora_weights[slice_idx],
|
||||
**kwargs)
|
||||
elif self == OpType.BGMV_SHRINK:
|
||||
assert num_slices == 1
|
||||
ref_group_gemm(ref_out=output,
|
||||
input=input,
|
||||
lora_weights=lora_weights[0],
|
||||
**kwargs)
|
||||
elif self == OpType.BGMV_EXPAND:
|
||||
assert num_slices == 1
|
||||
ref_group_gemm(ref_out=output,
|
||||
input=input.clone().to(dtype=w_dtype),
|
||||
lora_weights=lora_weights[0],
|
||||
**kwargs)
|
||||
elif self == OpType.BGMV_EXPAND_SLICE:
|
||||
elif self in [OpType.LORA_EXPAND]:
|
||||
hidden_size = lora_weights[0].shape[1]
|
||||
for slice_idx in range(num_slices):
|
||||
slice_offset = slice_idx * hidden_size
|
||||
@ -411,13 +317,11 @@ class BenchmarkTensors:
|
||||
input: torch.Tensor
|
||||
lora_weights_lst: list[torch.Tensor]
|
||||
output: torch.Tensor
|
||||
# metadata tensors
|
||||
# LoRA kernel metadata
|
||||
lora_kernel_meta: LoRAKernelMeta
|
||||
# Metadata tensors used in testing correctness
|
||||
seq_lens: torch.Tensor
|
||||
seq_start_loc: torch.Tensor
|
||||
prompt_lora_mapping: torch.Tensor
|
||||
token_lora_mapping: torch.Tensor
|
||||
# v1 kernel metadata
|
||||
v1_kernel_meta: Optional[V1KernelMeta] = None
|
||||
|
||||
def io_types(self) -> str:
|
||||
return (f"{dtype_to_str(self.input.dtype)}x"
|
||||
@ -444,35 +348,29 @@ class BenchmarkTensors:
|
||||
assert ctx.num_active_loras <= ctx.num_loras
|
||||
total_tokens = ctx.batch_size * ctx.seq_length
|
||||
|
||||
# Make metadata tensors involved in correctness testing.
|
||||
# Prepare seq lens tensor
|
||||
seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
|
||||
(ctx.batch_size, ))
|
||||
# Prepare seq_start_loc tensor
|
||||
seq_start_loc_tensor = torch.cumsum(torch.tensor(
|
||||
[0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
|
||||
dim=0)
|
||||
assert total_tokens == seq_len_tensor.sum()
|
||||
# Prepare prompt lora indices tensor
|
||||
prompt_lora_indices_tensor = make_prompt_lora_mapping(
|
||||
ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
|
||||
# Prepare token lora indices tensor
|
||||
|
||||
# Make LoRAKernelMeta
|
||||
token_lora_indices_tensor = make_token_lora_mapping(
|
||||
total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
|
||||
seq_len_tensor, "cpu")
|
||||
|
||||
v1_kernel_meta = None
|
||||
if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
|
||||
v1_kernel_meta = V1KernelMeta.make(
|
||||
max_loras=ctx.num_loras,
|
||||
max_num_tokens=token_lora_indices_tensor.size(0),
|
||||
device="cpu")
|
||||
v1_kernel_meta.prepare_tensors(
|
||||
token_lora_mapping=token_lora_indices_tensor)
|
||||
lora_kernel_meta = LoRAKernelMeta.make(
|
||||
max_loras=ctx.num_loras,
|
||||
max_num_tokens=token_lora_indices_tensor.size(0),
|
||||
device="cpu")
|
||||
lora_kernel_meta.prepare_tensors(
|
||||
token_lora_mapping=token_lora_indices_tensor)
|
||||
|
||||
return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
|
||||
seq_len_tensor, seq_start_loc_tensor,
|
||||
prompt_lora_indices_tensor,
|
||||
token_lora_indices_tensor, v1_kernel_meta)
|
||||
lora_kernel_meta, seq_len_tensor,
|
||||
prompt_lora_indices_tensor)
|
||||
|
||||
def sanity_check(self) -> None:
|
||||
"""
|
||||
@ -482,9 +380,9 @@ class BenchmarkTensors:
|
||||
# check metadata tensors
|
||||
assert torch.sum(self.seq_lens) == num_tokens
|
||||
num_seqs = self.seq_lens.shape[0]
|
||||
assert self.seq_start_loc.shape[0] == num_seqs
|
||||
#assert self.seq_start_loc.shape[0] == num_seqs
|
||||
assert self.prompt_lora_mapping.shape[0] == num_seqs
|
||||
assert self.token_lora_mapping.shape[0] == num_tokens
|
||||
assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
|
||||
|
||||
def to_device(self, device: str):
|
||||
"""
|
||||
@ -499,220 +397,27 @@ class BenchmarkTensors:
|
||||
self.input = to_device(self.input)
|
||||
self.output = to_device(self.output)
|
||||
self.seq_lens = to_device(self.seq_lens)
|
||||
self.seq_start_loc = to_device(self.seq_start_loc)
|
||||
self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
|
||||
self.token_lora_mapping = to_device(self.token_lora_mapping)
|
||||
for i in range(len(self.lora_weights_lst)):
|
||||
self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
|
||||
|
||||
# v1 meta
|
||||
if self.v1_kernel_meta:
|
||||
for field_name in V1KernelMeta.__dataclass_fields__:
|
||||
field = getattr(self.v1_kernel_meta, field_name)
|
||||
assert isinstance(field, torch.Tensor)
|
||||
setattr(self.v1_kernel_meta, field_name, to_device(field))
|
||||
# LoRA meta
|
||||
for field_name in LoRAKernelMeta.__dataclass_fields__:
|
||||
field = getattr(self.lora_kernel_meta, field_name)
|
||||
assert isinstance(field, torch.Tensor)
|
||||
setattr(self.lora_kernel_meta, field_name, to_device(field))
|
||||
|
||||
def metadata(self) -> tuple[int, int, int]:
|
||||
"""
|
||||
Return num_seqs, num_tokens and max_seq_len
|
||||
"""
|
||||
num_seqs = self.seq_lens.shape[0]
|
||||
num_tokens = self.token_lora_mapping.shape[0]
|
||||
num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
|
||||
max_seq_len = torch.max(self.seq_lens).item()
|
||||
num_slices = len(self.lora_weights_lst)
|
||||
return num_seqs, num_tokens, max_seq_len, num_slices
|
||||
|
||||
def convert_to_sgmv_benchmark_tensors(self):
|
||||
"""
|
||||
For sgmv punica kernels, when consecutive sequences have the
|
||||
same LoRA ID, we just merge them together.
|
||||
This happens in punica.py::compute_metadata
|
||||
"""
|
||||
|
||||
# Collapse seq_lens and seq_start_loc
|
||||
_, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
|
||||
return_counts=True)
|
||||
cum_result = torch.cumsum(seq_lens, dim=0)
|
||||
seq_start_loc = torch.zeros_like(seq_lens)
|
||||
seq_start_loc[1:].copy_(cum_result[:-1])
|
||||
|
||||
# Collapse prompt mapping
|
||||
prompt_lora_mapping = torch.unique_consecutive(
|
||||
self.prompt_lora_mapping)
|
||||
|
||||
assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
|
||||
f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
|
||||
|
||||
self.prompt_lora_mapping = prompt_lora_mapping.to(
|
||||
dtype=self.prompt_lora_mapping.dtype)
|
||||
self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
|
||||
self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
|
||||
|
||||
def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
|
||||
self.convert_to_sgmv_benchmark_tensors()
|
||||
self.sanity_check()
|
||||
self.to_device(self.input.device)
|
||||
|
||||
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
|
||||
|
||||
# Sanity check matrix shapes.
|
||||
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
|
||||
0].shape, self.output.shape
|
||||
# Expected input shape [num_tokens, hidden_size]
|
||||
assert len(i_shape) == 2
|
||||
assert i_shape[0] == num_tokens
|
||||
hidden_size = i_shape[1]
|
||||
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
|
||||
assert len(lw_shape) == 3
|
||||
assert lw_shape[2] == hidden_size
|
||||
lora_rank = lw_shape[1]
|
||||
# Expected output shape [num_slices, num_tokens, lora_rank]
|
||||
assert len(o_shape) == 3
|
||||
assert o_shape == (num_slices, num_tokens, lora_rank)
|
||||
|
||||
return {
|
||||
'inputs': self.input,
|
||||
'lora_a_weights': self.lora_weights_lst,
|
||||
'output_tensor': self.output,
|
||||
'b_seq_start_loc': self.seq_start_loc,
|
||||
'seq_len_tensor': self.seq_lens,
|
||||
'lora_indices_tensor': self.prompt_lora_mapping,
|
||||
'batches': num_seqs,
|
||||
'max_seq_length': max_seq_len,
|
||||
'token_nums': num_tokens,
|
||||
'scaling': 1.0,
|
||||
}
|
||||
|
||||
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
|
||||
|
||||
self.convert_to_sgmv_benchmark_tensors()
|
||||
self.sanity_check()
|
||||
self.to_device(self.input.device)
|
||||
|
||||
num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
|
||||
|
||||
# Sanity check matrix shapes.
|
||||
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
|
||||
0].shape, self.output.shape
|
||||
# Expected input shape : [num_slices, num_tokens, lora_rank]
|
||||
assert len(i_shape) == 3
|
||||
assert i_shape[0] == num_slices
|
||||
assert i_shape[1] == num_tokens
|
||||
lora_rank = i_shape[2]
|
||||
# Expected lora weight shape : [num_lora, hidden_size, lora_rank]
|
||||
assert len(lw_shape) == 3
|
||||
assert lw_shape[2] == lora_rank
|
||||
hidden_size = lw_shape[1]
|
||||
# Expected output shape : [num_tokens, hidden_size * num_slices]
|
||||
assert len(o_shape) == 2
|
||||
assert o_shape == (num_tokens, hidden_size * num_slices)
|
||||
|
||||
return {
|
||||
'inputs': self.input,
|
||||
'lora_b_weights': self.lora_weights_lst,
|
||||
'output_tensor': self.output,
|
||||
'b_seq_start_loc': self.seq_start_loc,
|
||||
'seq_len_tensor': self.seq_lens,
|
||||
'lora_indices_tensor': self.prompt_lora_mapping,
|
||||
'batches': num_seqs,
|
||||
'max_seq_length': max_seq_len,
|
||||
'token_nums': num_tokens,
|
||||
'offset_start': 0,
|
||||
'add_inputs': add_inputs,
|
||||
}
|
||||
|
||||
def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
|
||||
assert len(self.lora_weights_lst) == 1
|
||||
self.to_device(self.input.device)
|
||||
|
||||
_, num_tokens, _, _ = self.metadata()
|
||||
# Sanity check shapes
|
||||
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
|
||||
0].shape, self.output.shape
|
||||
# Expected input shape [num_tokens, hidden_size]
|
||||
assert len(i_shape) == 2
|
||||
assert i_shape[0] == num_tokens
|
||||
hidden_size = i_shape[1]
|
||||
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
|
||||
assert len(lw_shape) == 3
|
||||
assert lw_shape[2] == hidden_size
|
||||
lora_rank = lw_shape[1]
|
||||
# Expected output shape [num_tokens, lora_rank]
|
||||
assert len(o_shape) == 2
|
||||
assert o_shape == (num_tokens, lora_rank)
|
||||
|
||||
return {
|
||||
'inputs': self.input,
|
||||
'lora_a_weights': self.lora_weights_lst[0],
|
||||
'output_tensor': self.output,
|
||||
'lora_indices_tensor': self.token_lora_mapping,
|
||||
'scaling': 1.0
|
||||
}
|
||||
|
||||
def as_bgmv_expand_kwargs(self, add_inputs: bool):
|
||||
assert len(self.lora_weights_lst) == 1
|
||||
self.to_device(self.input.device)
|
||||
|
||||
_, num_tokens, _, _ = self.metadata()
|
||||
# Sanity check shapes
|
||||
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
|
||||
0].shape, self.output.shape
|
||||
# Expected input shape [num_tokens, lora_rank]
|
||||
assert len(i_shape) == 2
|
||||
assert i_shape[0] == num_tokens
|
||||
lora_rank = i_shape[1]
|
||||
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
|
||||
assert len(lw_shape) == 3
|
||||
assert lw_shape[2] == lora_rank
|
||||
hidden_size = lw_shape[1]
|
||||
# Expected output shape [num_tokens, hidden_size]
|
||||
assert len(o_shape) == 2
|
||||
assert o_shape == (num_tokens, hidden_size)
|
||||
|
||||
return {
|
||||
'inputs': self.input,
|
||||
'lora_b_weights': self.lora_weights_lst[0],
|
||||
'output_tensor': self.output,
|
||||
'lora_indices_tensor': self.token_lora_mapping,
|
||||
'add_inputs': add_inputs
|
||||
}
|
||||
|
||||
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
|
||||
|
||||
_, num_tokens, _, num_slices = self.metadata()
|
||||
# Sanity check shapes
|
||||
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
|
||||
0].shape, self.output.shape
|
||||
# Expected input shape [num_slices, num_tokens, lora_rank]
|
||||
assert len(i_shape) == 3
|
||||
assert i_shape[0] == num_slices
|
||||
assert i_shape[1] == num_tokens
|
||||
lora_rank = i_shape[2]
|
||||
# Expected lora weight shape [num_loras, hidden_size, lora_rank]
|
||||
assert len(lw_shape) == 3
|
||||
assert lw_shape[2] == lora_rank
|
||||
hidden_size = lw_shape[1]
|
||||
# Expected output shape [num_tokens, hidden_size * num_slices]
|
||||
assert len(o_shape) == 2
|
||||
assert o_shape == (num_tokens, hidden_size * num_slices)
|
||||
|
||||
self.to_device(self.input.device)
|
||||
|
||||
kwargs_list = []
|
||||
for i in range(num_slices):
|
||||
kwargs_list.append({
|
||||
'inputs': self.input[i],
|
||||
'lora_b_weights': self.lora_weights_lst[i],
|
||||
'output_tensor': self.output,
|
||||
'lora_indices_tensor': self.token_lora_mapping,
|
||||
'slice_offset': i * hidden_size,
|
||||
'slice_size': hidden_size,
|
||||
'add_inputs': add_inputs,
|
||||
})
|
||||
return {'kwargs_list': kwargs_list}
|
||||
|
||||
def as_v1_shrink_kwargs(self) -> dict[str, Any]:
|
||||
assert self.v1_kernel_meta is not None
|
||||
def as_lora_shrink_kwargs(self) -> dict[str, Any]:
|
||||
self.sanity_check()
|
||||
self.to_device(self.input.device)
|
||||
|
||||
@ -737,17 +442,16 @@ class BenchmarkTensors:
|
||||
'inputs': self.input,
|
||||
'lora_a_weights': self.lora_weights_lst,
|
||||
'output_tensor': self.output,
|
||||
'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
|
||||
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
|
||||
'token_indices_sorted_by_lora_ids':
|
||||
self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
|
||||
'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
|
||||
'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
|
||||
'lora_ids': self.v1_kernel_meta.active_lora_ids,
|
||||
self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
|
||||
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
|
||||
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
|
||||
'lora_ids': self.lora_kernel_meta.active_lora_ids,
|
||||
'scaling': 1.0,
|
||||
}
|
||||
|
||||
def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
|
||||
assert self.v1_kernel_meta is not None
|
||||
def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
|
||||
self.sanity_check()
|
||||
self.to_device(self.input.device)
|
||||
|
||||
@ -773,12 +477,12 @@ class BenchmarkTensors:
|
||||
'inputs': self.input,
|
||||
'lora_b_weights': self.lora_weights_lst,
|
||||
'output_tensor': self.output,
|
||||
'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
|
||||
'token_lora_mapping': self.lora_kernel_meta.token_lora_mapping,
|
||||
'token_indices_sorted_by_lora_ids':
|
||||
self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
|
||||
'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
|
||||
'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
|
||||
'lora_ids': self.v1_kernel_meta.active_lora_ids,
|
||||
self.lora_kernel_meta.token_indices_sorted_by_lora_ids,
|
||||
'num_tokens_per_lora': self.lora_kernel_meta.num_tokens_per_lora,
|
||||
'lora_token_start_loc': self.lora_kernel_meta.lora_token_start_loc,
|
||||
'lora_ids': self.lora_kernel_meta.active_lora_ids,
|
||||
'offset_start': 0,
|
||||
'add_inputs': add_inputs,
|
||||
}
|
||||
@ -791,20 +495,10 @@ class BenchmarkTensors:
|
||||
else:
|
||||
assert add_inputs is not None
|
||||
|
||||
if op_type == OpType.SGMV_SHRINK:
|
||||
return self.as_sgmv_shrink_kwargs()
|
||||
if op_type == OpType.SGMV_EXPAND:
|
||||
return self.as_sgmv_expand_kwargs(add_inputs)
|
||||
if op_type == OpType.BGMV_SHRINK:
|
||||
return self.as_bgmv_shrink_kwargs()
|
||||
if op_type == OpType.BGMV_EXPAND:
|
||||
return self.as_bgmv_expand_kwargs(add_inputs)
|
||||
if op_type == OpType.BGMV_EXPAND_SLICE:
|
||||
return self.as_bgmv_expand_slice_kwargs(add_inputs)
|
||||
if op_type == OpType.V1_SHRINK:
|
||||
return self.as_v1_shrink_kwargs()
|
||||
if op_type == OpType.V1_EXPAND:
|
||||
return self.as_v1_expand_kwargs(add_inputs)
|
||||
if op_type == OpType.LORA_SHRINK:
|
||||
return self.as_lora_shrink_kwargs()
|
||||
if op_type == OpType.LORA_EXPAND:
|
||||
return self.as_lora_expand_kwargs(add_inputs)
|
||||
raise ValueError(f"Unrecognized optype {self}")
|
||||
|
||||
def test_correctness(self, op_type: OpType,
|
||||
@ -993,10 +687,6 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
|
||||
for bench_ctx in bench_ctxs:
|
||||
for seq_len in args.seq_lengths:
|
||||
bench_ops: list[OpType] = args.op_types
|
||||
if seq_len > 1:
|
||||
# bench only prefill ops
|
||||
bench_ops = [op for op in args.op_types if op.is_prefill_op()]
|
||||
|
||||
seq_len_timers = []
|
||||
for bench_op in bench_ops:
|
||||
for num_slices in bench_op.num_slices():
|
||||
@ -1206,13 +896,13 @@ Benchmark LoRA kernels:
|
||||
{use_cuda_graph_recommendation()}
|
||||
|
||||
list_bench example:
|
||||
python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
|
||||
python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
|
||||
|
||||
model_bench example:
|
||||
python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
|
||||
python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
|
||||
|
||||
range_bench example:
|
||||
python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
|
||||
python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types lora_shrink lora_expand --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8
|
||||
""", # noqa: E501
|
||||
formatter_class=argparse.RawTextHelpFormatter)
|
||||
|
||||
|
||||
@ -54,6 +54,7 @@ for qps in "${QPS_VALUES[@]}"; do
|
||||
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
|
||||
--request-rate $qps \
|
||||
--result-filename "$FILENAME" \
|
||||
--tokenizer-mode ${TOKENIZER_MODE:-"auto"} \
|
||||
--port ${PORT:-8000}
|
||||
|
||||
echo "Completed benchmark with QPS: $qps"
|
||||
|
||||
@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9
|
||||
GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@ -350,8 +350,8 @@ __global__ void concat_and_cache_mla_kernel(
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// KV_T is the stored data type of kv-cache.
|
||||
// CACHE_T is the data type of key and value tensors.
|
||||
// KV_T is the data type of key and value tensors.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
@ -393,8 +393,8 @@ void reshape_and_cache(
|
||||
CALL_RESHAPE_AND_CACHE)
|
||||
}
|
||||
|
||||
// KV_T is the stored data type of kv-cache.
|
||||
// CACHE_T is the data type of key and value tensors.
|
||||
// KV_T is the data type of key and value tensors.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
@ -446,8 +446,8 @@ void reshape_and_cache_flash(
|
||||
CALL_RESHAPE_AND_CACHE_FLASH);
|
||||
}
|
||||
|
||||
// KV_T is the stored data type of kv-cache.
|
||||
// CACHE_T is the data type of key and value tensors.
|
||||
// KV_T is the data type of key and value tensors.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
|
||||
@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
|
||||
union tmpcvt {
|
||||
[[maybe_unused]] union tmpcvt {
|
||||
uint16_t u;
|
||||
_Float16 f;
|
||||
__hip_bfloat16 b;
|
||||
@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
|
||||
template <typename T>
|
||||
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
|
||||
const _B16x4& inp2) {
|
||||
union tmpcvt {
|
||||
[[maybe_unused]] union tmpcvt {
|
||||
uint16_t u;
|
||||
_Float16 f;
|
||||
__hip_bfloat16 b;
|
||||
@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
const int seq_idx = blockIdx.y;
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
||||
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||
const int warpid = threadIdx.x / WARP_SIZE;
|
||||
const int laneid = threadIdx.x % WARP_SIZE;
|
||||
[[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
|
||||
|
||||
__shared__ float shared_global_exp_sum;
|
||||
// max num partitions supported is warp_size * NPAR_LOOPS
|
||||
|
||||
@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
|
||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:v0.7.3
|
||||
FROM vllm/vllm-openai:v0.8.0
|
||||
|
||||
# e.g. install the `audio` and `video` optional dependencies
|
||||
# NOTE: Make sure the version of vLLM matches the base image!
|
||||
RUN uv pip install --system vllm[audio,video]==0.7.3
|
||||
RUN uv pip install vllm[audio,video]==0.8.0
|
||||
```
|
||||
|
||||
:::
|
||||
@ -52,7 +52,7 @@ with an extra layer that installs their code from source:
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:latest
|
||||
|
||||
RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
||||
RUN uv pip install git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
@ -191,7 +191,7 @@ When the head block (least recently used block) of the free queue is cached, we
|
||||
|
||||
In this example, we assume the block size is 4 (each block can cache 4 tokens), and we have 10 blocks in the KV-cache manager in total.
|
||||
|
||||
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 2 of 4 tokens.
|
||||
**Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-1.png
|
||||
:alt: Example Time 1
|
||||
@ -203,7 +203,7 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
|
||||
:alt: Example Time 3
|
||||
:::
|
||||
|
||||
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 11 tokens are the same as request 0.** We can see that only 2 blocks (11 tokens) hit the cache, because the 3rd block only matches 3 of 4 tokens.
|
||||
**Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
|
||||
|
||||
:::{image} /assets/design/v1/prefix_caching/example-time-4.png
|
||||
:alt: Example Time 4
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
|
||||
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
||||
|
||||
To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason!
|
||||
|
||||
## Why vLLM V1?
|
||||
|
||||
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
|
||||
|
||||
@ -477,6 +477,11 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Zamba2ForCausalLM`
|
||||
* Zamba2
|
||||
* `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
|
||||
*
|
||||
*
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
@ -879,7 +884,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
- * `PixtralForConditionalGeneration`
|
||||
* Pixtral
|
||||
* T + I<sup>+</sup>
|
||||
* `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b`, etc.
|
||||
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
@ -946,7 +951,7 @@ V0 correctly implements the model's attention pattern:
|
||||
|
||||
V1 currently uses a simplified attention pattern:
|
||||
- Uses causal attention for all tokens, including image tokens
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
|
||||
- Will be updated in the future to support the correct behavior
|
||||
|
||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||
|
||||
@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should
|
||||
|
||||
Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
|
||||
|
||||
After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
|
||||
After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
|
||||
|
||||
```console
|
||||
vllm serve /path/to/the/model/in/the/container \
|
||||
|
||||
@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
|
||||
on HuggingFace model repository.
|
||||
"""
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
@ -23,21 +25,31 @@ question_per_audio_count = {
|
||||
2: "What sport and what nursery rhyme are referenced?"
|
||||
}
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
# lower-end GPUs.
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# MiniCPM-O
|
||||
def run_minicpmo(question: str, audio_count: int):
|
||||
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "openbmb/MiniCPM-o-2_6"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
llm = LLM(model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"audio": audio_count})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
stop_tokens = ['<|im_end|>', '<|endoftext|>']
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
chat_template=audio_chat_template)
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# Phi-4-multimodal-instruct
|
||||
def run_phi4mm(questions: str, audio_count: int):
|
||||
def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
|
||||
"""
|
||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||
show how to process audio inputs.
|
||||
@ -67,36 +84,35 @@ def run_phi4mm(questions: str, audio_count: int):
|
||||
speech_lora_path = os.path.join(model_path, "speech-lora")
|
||||
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
|
||||
|
||||
prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
|
||||
prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
lora_extra_vocab_size=0,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
lora_request = LoRARequest("speech", 1, speech_lora_path)
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompts,
|
||||
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
# Qwen2-Audio
|
||||
def run_qwen2_audio(question: str, audio_count: int):
|
||||
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"audio": audio_count})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
audio_in_prompt = "".join([
|
||||
f"Audio {idx+1}: "
|
||||
@ -107,12 +123,15 @@ def run_qwen2_audio(question: str, audio_count: int):
|
||||
"<|im_start|>user\n"
|
||||
f"{audio_in_prompt}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
# Ultravox 0.5-1B
|
||||
def run_ultravox(question: str, audio_count: int):
|
||||
def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
@ -124,29 +143,39 @@ def run_ultravox(question: str, audio_count: int):
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"audio": audio_count})
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
# Whisper
|
||||
def run_whisper(question: str, audio_count: int):
|
||||
def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
assert audio_count == 1, (
|
||||
"Whisper only support single audio input per prompt")
|
||||
model_name = "openai/whisper-large-v3-turbo"
|
||||
|
||||
prompt = "<|startoftranscript|>"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=448,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"audio": audio_count})
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=448,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
@ -164,14 +193,24 @@ def main(args):
|
||||
raise ValueError(f"Model type {model} is not supported.")
|
||||
|
||||
audio_count = args.num_audios
|
||||
llm, prompt, stop_token_ids = model_example_map[model](
|
||||
question_per_audio_count[audio_count], audio_count)
|
||||
req_data = model_example_map[model](question_per_audio_count[audio_count],
|
||||
audio_count)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
if req_data.lora_requests:
|
||||
for lora_request in req_data.lora_requests:
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=64,
|
||||
stop_token_ids=stop_token_ids)
|
||||
stop_token_ids=req_data.stop_token_ids)
|
||||
|
||||
mm_data = {}
|
||||
if audio_count > 0:
|
||||
@ -183,7 +222,7 @@ def main(args):
|
||||
}
|
||||
|
||||
assert args.num_prompts > 0
|
||||
inputs = {"prompt": prompt, "multi_modal_data": mm_data}
|
||||
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
|
||||
if args.num_prompts > 1:
|
||||
# Batch inference
|
||||
inputs = [inputs] * args.num_prompts
|
||||
@ -214,6 +253,10 @@ if __name__ == "__main__":
|
||||
default=1,
|
||||
choices=[0, 1, 2],
|
||||
help="Number of audio items per prompt.")
|
||||
parser.add_argument("--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
|
||||
the explicit/implicit prompt format on enc-dec LMMs for text generation.
|
||||
"""
|
||||
import time
|
||||
from collections.abc import Sequence
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, EngineArgs, PromptType, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompts: Sequence[PromptType]
|
||||
|
||||
|
||||
def run_florence2():
|
||||
# Create a Florence-2 encoder/decoder model instance
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="facebook/bart-large",
|
||||
max_num_seqs=8,
|
||||
@ -39,12 +46,15 @@ def run_florence2():
|
||||
"decoder_prompt": "",
|
||||
},
|
||||
]
|
||||
return llm, prompts
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_mllama():
|
||||
# Create a Mllama encoder/decoder model instance
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -69,12 +79,15 @@ def run_mllama():
|
||||
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
|
||||
},
|
||||
]
|
||||
return llm, prompts
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
def run_whisper():
|
||||
# Create a Whisper encoder/decoder model instance
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="openai/whisper-large-v3-turbo",
|
||||
max_model_len=448,
|
||||
max_num_seqs=16,
|
||||
@ -99,7 +112,11 @@ def run_whisper():
|
||||
"decoder_prompt": "<|startoftranscript|>",
|
||||
}
|
||||
]
|
||||
return llm, prompts
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
@ -114,7 +131,12 @@ def main(args):
|
||||
if model not in model_example_map:
|
||||
raise ValueError(f"Model type {model} is not supported.")
|
||||
|
||||
llm, prompts = model_example_map[model]()
|
||||
req_data = model_example_map[model]()
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
prompts = req_data.prompts
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(
|
||||
@ -153,6 +175,10 @@ if __name__ == "__main__":
|
||||
default="mllama",
|
||||
choices=model_example_map.keys(),
|
||||
help='Huggingface "model_type".')
|
||||
parser.add_argument("--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -6,14 +6,16 @@ import argparse
|
||||
from vllm import LLM
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
# This script is an offline demo for running Pixtral.
|
||||
# This script is an offline demo for running Mistral-Small-3.1
|
||||
#
|
||||
# If you want to run a server/client setup, please follow this code:
|
||||
#
|
||||
# - Server:
|
||||
#
|
||||
# ```bash
|
||||
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
||||
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
|
||||
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
|
||||
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
||||
# ```
|
||||
#
|
||||
# - Client:
|
||||
@ -23,7 +25,7 @@ from vllm.sampling_params import SamplingParams
|
||||
# --header 'Content-Type: application/json' \
|
||||
# --header 'Authorization: Bearer token' \
|
||||
# --data '{
|
||||
# "model": "mistralai/Pixtral-12B-2409",
|
||||
# "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
|
||||
# "messages": [
|
||||
# {
|
||||
# "role": "user",
|
||||
@ -44,13 +46,15 @@ from vllm.sampling_params import SamplingParams
|
||||
|
||||
|
||||
def run_simple_demo(args: argparse.Namespace):
|
||||
model_name = "mistralai/Pixtral-12B-2409"
|
||||
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
sampling_params = SamplingParams(max_tokens=8192)
|
||||
|
||||
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
@ -83,7 +87,7 @@ def run_simple_demo(args: argparse.Namespace):
|
||||
|
||||
|
||||
def run_advanced_demo(args: argparse.Namespace):
|
||||
model_name = "mistralai/Pixtral-12B-2409"
|
||||
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
max_img_per_msg = 5
|
||||
max_tokens_per_img = 4096
|
||||
|
||||
@ -91,6 +95,8 @@ def run_advanced_demo(args: argparse.Namespace):
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
limit_mm_per_prompt={"image": max_img_per_msg},
|
||||
max_model_len=max_img_per_msg * max_tokens_per_img,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
@ -8,126 +8,167 @@ on HuggingFace model repository.
|
||||
"""
|
||||
import os
|
||||
import random
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompts: list[str]
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
# lower-end GPUs.
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Aria
|
||||
def run_aria(questions: list[str], modality: str):
|
||||
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "rhymes-ai/Aria"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
|
||||
"<|im_end|>\n<|im_start|>assistant\n")
|
||||
for question in questions]
|
||||
|
||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# BLIP-2
|
||||
def run_blip2(questions: list[str], modality: str):
|
||||
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
|
||||
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
|
||||
prompts = [f"Question: {question} Answer:" for question in questions]
|
||||
llm = LLM(model="Salesforce/blip2-opt-2.7b",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="Salesforce/blip2-opt-2.7b",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Chameleon
|
||||
def run_chameleon(questions: list[str], modality: str):
|
||||
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"{question}<image>" for question in questions]
|
||||
llm = LLM(model="facebook/chameleon-7b",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="facebook/chameleon-7b",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Deepseek-VL2
|
||||
def run_deepseek_vl2(questions: list[str], modality: str):
|
||||
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
||||
)
|
||||
|
||||
prompts = [
|
||||
f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
|
||||
for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Florence2
|
||||
def run_florence2(question: str, modality: str):
|
||||
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
llm = LLM(model="microsoft/Florence-2-large",
|
||||
tokenizer="facebook/bart-large",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Florence-2-large",
|
||||
tokenizer="facebook/bart-large",
|
||||
max_num_seqs=8,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompt = "<MORE_DETAILED_CAPTION>"
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Fuyu
|
||||
def run_fuyu(questions: list[str], modality: str):
|
||||
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"{question}\n" for question in questions]
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Gemma 3
|
||||
def run_gemma3(questions: list[str], modality: str):
|
||||
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "google/gemma-3-4b-it"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
# Default is False; setting it to True is not supported in V1 yet
|
||||
mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
@ -135,22 +176,27 @@ def run_gemma3(questions: list[str], modality: str):
|
||||
prompts = [("<bos><start_of_turn>user\n"
|
||||
f"<start_of_image>{question}<end_of_turn>\n"
|
||||
"<start_of_turn>model\n") for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# GLM-4v
|
||||
def run_glm4v(questions: list[str], modality: str):
|
||||
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "THUDM/glm-4v-9b"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompts = [
|
||||
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
|
||||
@ -158,16 +204,21 @@ def run_glm4v(questions: list[str], modality: str):
|
||||
]
|
||||
|
||||
stop_token_ids = [151329, 151336, 151338]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# H2OVL-Mississippi
|
||||
def run_h2ovl(questions: list[str], modality: str):
|
||||
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "h2oai/h2ovl-mississippi-800m"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
@ -187,15 +238,20 @@ def run_h2ovl(questions: list[str], modality: str):
|
||||
# Stop tokens for H2OVL-Mississippi
|
||||
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
|
||||
stop_token_ids = [tokenizer.eos_token_id]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# Idefics3-8B-Llama3
|
||||
def run_idefics3(questions: list[str], modality: str):
|
||||
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -212,17 +268,20 @@ def run_idefics3(questions: list[str], modality: str):
|
||||
prompts = [(
|
||||
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
|
||||
) for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# InternVL
|
||||
def run_internvl(questions: list[str], modality: str):
|
||||
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -245,53 +304,75 @@ def run_internvl(questions: list[str], modality: str):
|
||||
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# LLaVA-1.5
|
||||
def run_llava(questions: list[str], modality: str):
|
||||
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [
|
||||
f"USER: <image>\n{question}\nASSISTANT:" for question in questions
|
||||
]
|
||||
|
||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf",
|
||||
max_model_len=4096,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
max_model_len=4096,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LLaVA-1.6/LLaVA-NeXT
|
||||
def run_llava_next(questions: list[str], modality: str):
|
||||
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
|
||||
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LlaVA-NeXT-Video
|
||||
# Currently only support for video input
|
||||
def run_llava_next_video(questions: list[str], modality: str):
|
||||
def run_llava_next_video(questions: list[str],
|
||||
modality: str) -> ModelRequestData:
|
||||
assert modality == "video"
|
||||
|
||||
prompts = [
|
||||
f"USER: <video>\n{question} ASSISTANT:" for question in questions
|
||||
]
|
||||
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LLaVA-OneVision
|
||||
def run_llava_onevision(questions: list[str], modality: str):
|
||||
def run_llava_onevision(questions: list[str],
|
||||
modality: str) -> ModelRequestData:
|
||||
|
||||
if modality == "video":
|
||||
prompts = [
|
||||
@ -305,15 +386,20 @@ def run_llava_onevision(questions: list[str], modality: str):
|
||||
<|im_start|>assistant\n" for question in questions
|
||||
]
|
||||
|
||||
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
|
||||
max_model_len=16384,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
engine_args = EngineArgs(
|
||||
model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
|
||||
max_model_len=16384,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Mantis
|
||||
def run_mantis(questions: list[str], modality: str):
|
||||
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
|
||||
@ -322,14 +408,19 @@ def run_mantis(questions: list[str], modality: str):
|
||||
for question in questions
|
||||
]
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
max_model_len=4096,
|
||||
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
stop_token_ids = [128009]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# MiniCPM-V
|
||||
@ -357,7 +448,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
|
||||
# model_name = "openbmb/MiniCPM-o-2_6"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
@ -389,19 +480,24 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
|
||||
tokenize=False,
|
||||
add_generation_prompt=True) for question in questions
|
||||
]
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
def run_minicpmo(questions: list[str], modality: str):
|
||||
def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
|
||||
|
||||
|
||||
def run_minicpmv(questions: list[str], modality: str):
|
||||
def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
|
||||
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
|
||||
|
||||
|
||||
# LLama 3.2
|
||||
def run_mllama(questions: list[str], modality: str):
|
||||
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
@ -411,7 +507,7 @@ def run_mllama(questions: list[str], modality: str):
|
||||
# You may lower either to run this example on lower-end GPUs.
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=16,
|
||||
@ -432,17 +528,20 @@ def run_mllama(questions: list[str], modality: str):
|
||||
prompts = tokenizer.apply_chat_template(messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=False)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Molmo
|
||||
def run_molmo(questions: list[str], modality: str):
|
||||
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "allenai/Molmo-7B-D-0924"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
@ -453,18 +552,21 @@ def run_molmo(questions: list[str], modality: str):
|
||||
f"<|im_start|>user <image>\n{question}<|im_end|> \
|
||||
<|im_start|>assistant\n" for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# NVLM-D
|
||||
def run_nvlm_d(questions: list[str], modality: str):
|
||||
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "nvidia/NVLM-D-72B"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -481,36 +583,47 @@ def run_nvlm_d(questions: list[str], modality: str):
|
||||
prompts = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# PaliGemma
|
||||
def run_paligemma(question: str, modality: str):
|
||||
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
# PaliGemma has special prompt format for VQA
|
||||
prompt = ["caption en"]
|
||||
llm = LLM(model="google/paligemma-3b-mix-224",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
prompts = ["caption en" for _ in questions]
|
||||
engine_args = EngineArgs(
|
||||
model="google/paligemma-3b-mix-224",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# PaliGemma 2
|
||||
def run_paligemma2(question: str, modality: str):
|
||||
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
# PaliGemma 2 has special prompt format for VQA
|
||||
prompt = ["caption en"]
|
||||
llm = LLM(model="google/paligemma2-3b-ft-docci-448",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
prompts = ["caption en" for _ in questions]
|
||||
engine_args = EngineArgs(
|
||||
model="google/paligemma2-3b-ft-docci-448",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Phi-3-Vision
|
||||
def run_phi3v(questions: list[str], modality: str):
|
||||
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
prompts = [
|
||||
@ -530,7 +643,7 @@ def run_phi3v(questions: list[str], modality: str):
|
||||
#
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -539,12 +652,15 @@ def run_phi3v(questions: list[str], modality: str):
|
||||
mm_processor_kwargs={"num_crops": 16},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Phi-4-multimodal-instruct
|
||||
def run_phi4mm(questions: list[str], modality: str):
|
||||
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
|
||||
"""
|
||||
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
|
||||
show how to process image inputs.
|
||||
@ -558,33 +674,30 @@ def run_phi4mm(questions: list[str], modality: str):
|
||||
f"<|user|><|image_1|>{question}<|end|><|assistant|>"
|
||||
for question in questions
|
||||
]
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
lora_extra_vocab_size=0,
|
||||
)
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
# Pixtral HF-format
|
||||
def run_pixtral_hf(questions: list[str], modality: str):
|
||||
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "mistral-community/pixtral-12b"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -592,15 +705,18 @@ def run_pixtral_hf(questions: list[str], modality: str):
|
||||
)
|
||||
|
||||
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Qwen
|
||||
def run_qwen_vl(questions: list[str], modality: str):
|
||||
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen-VL",
|
||||
trust_remote_code=True,
|
||||
max_model_len=1024,
|
||||
@ -610,16 +726,19 @@ def run_qwen_vl(questions: list[str], modality: str):
|
||||
)
|
||||
|
||||
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Qwen2-VL
|
||||
def run_qwen2_vl(questions: list[str], modality: str):
|
||||
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
|
||||
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
@ -642,16 +761,19 @@ def run_qwen2_vl(questions: list[str], modality: str):
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n") for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# Qwen2.5-VL
|
||||
def run_qwen2_5_vl(questions: list[str], modality: str):
|
||||
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
|
||||
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
@ -674,8 +796,11 @@ def run_qwen2_5_vl(questions: list[str], modality: str):
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n") for question in questions
|
||||
]
|
||||
stop_token_ids = None
|
||||
return llm, prompts, stop_token_ids
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
@ -789,18 +914,28 @@ def main(args):
|
||||
data = mm_input["data"]
|
||||
questions = mm_input["questions"]
|
||||
|
||||
llm, prompts, stop_token_ids = model_example_map[model](questions,
|
||||
modality)
|
||||
req_data = model_example_map[model](questions, modality)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
if req_data.lora_requests:
|
||||
for lora_request in req_data.lora_requests:
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
# Don't want to check the flag multiple times, so just hijack `prompts`.
|
||||
prompts = prompts if args.use_different_prompt_per_request else [
|
||||
prompts[0]
|
||||
prompts = req_data.prompts if args.use_different_prompt_per_request else [
|
||||
req_data.prompts[0]
|
||||
]
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=64,
|
||||
stop_token_ids=stop_token_ids)
|
||||
stop_token_ids=req_data.stop_token_ids)
|
||||
|
||||
assert args.num_prompts > 0
|
||||
if args.num_prompts == 1:
|
||||
@ -865,6 +1000,10 @@ if __name__ == "__main__":
|
||||
type=int,
|
||||
default=16,
|
||||
help='Number of frames to extract from the video.')
|
||||
parser.add_argument("--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.")
|
||||
|
||||
parser.add_argument(
|
||||
'--image-repeat-prob',
|
||||
|
||||
@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
|
||||
on HuggingFace model repository.
|
||||
"""
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
llm: LLM
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
image: Optional[Image]
|
||||
|
||||
|
||||
def run_e5_v(query: Query):
|
||||
def run_e5_v(query: Query) -> ModelRequestData:
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
|
||||
|
||||
if query["modality"] == "text":
|
||||
@ -58,20 +59,20 @@ def run_e5_v(query: Query):
|
||||
modality = query['modality']
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="royokong/e5-v",
|
||||
task="embed",
|
||||
max_model_len=4096,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_vlm2vec(query: Query):
|
||||
def run_vlm2vec(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
|
||||
@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
|
||||
modality = query['modality']
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
task="embed",
|
||||
trust_remote_code=True,
|
||||
@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
def run_encode(model: str, modality: QueryModality):
|
||||
def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
|
||||
query = get_query(modality)
|
||||
req_data = model_example_map[model](query)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
mm_data = {}
|
||||
if req_data.image is not None:
|
||||
mm_data["image"] = req_data.image
|
||||
|
||||
outputs = req_data.llm.embed({
|
||||
outputs = llm.embed({
|
||||
"prompt": req_data.prompt,
|
||||
"multi_modal_data": mm_data,
|
||||
})
|
||||
@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
run_encode(args.model_name, args.modality)
|
||||
run_encode(args.model_name, args.modality, args.seed)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
@ -167,5 +171,10 @@ if __name__ == "__main__":
|
||||
default="image",
|
||||
choices=get_args(QueryModality),
|
||||
help='Modality of the input.')
|
||||
parser.add_argument("--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -6,13 +6,14 @@ using the chat template defined by the model.
|
||||
"""
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoProcessor, AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
@ -25,11 +26,12 @@ IMAGE_URLS = [
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
llm: LLM
|
||||
engine_args: EngineArgs
|
||||
prompt: str
|
||||
stop_token_ids: Optional[list[int]]
|
||||
image_data: list[Image]
|
||||
chat_template: Optional[str]
|
||||
stop_token_ids: Optional[list[int]] = None
|
||||
chat_template: Optional[str] = None
|
||||
lora_requests: Optional[list[LoRARequest]] = None
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
@ -37,58 +39,58 @@ class ModelRequestData(NamedTuple):
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
def load_aria(question, image_urls: list[str]) -> ModelRequestData:
|
||||
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "rhymes-ai/Aria"
|
||||
llm = LLM(model=model_name,
|
||||
tokenizer_mode="slow",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={"image": len(image_urls)})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
tokenizer_mode="slow",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
|
||||
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_deepseek_vl2(question: str, image_urls: list[str]):
|
||||
def load_deepseek_vl2(question: str,
|
||||
image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
||||
limit_mm_per_prompt={"image": len(image_urls)})
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholder = "".join(f"image_{i}:<image>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
|
||||
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "google/gemma-3-4b-it"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
# Default is False; setting it to True is not supported in V1 yet
|
||||
mm_processor_kwargs={"do_pan_and_scan": True},
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
@ -112,18 +114,16 @@ def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
|
||||
add_generation_prompt=True)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "h2oai/h2ovl-mississippi-800m"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
@ -146,19 +146,18 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
stop_token_ids = [tokenizer.eos_token_id]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
|
||||
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=16,
|
||||
@ -177,18 +176,16 @@ def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -214,19 +211,18 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
|
||||
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=16,
|
||||
@ -236,19 +232,17 @@ def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
|
||||
placeholders = "<|image|>" * len(image_urls)
|
||||
prompt = f"{placeholders}<|begin_of_text|>{question}"
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_nvlm_d(question: str, image_urls: list[str]):
|
||||
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "nvidia/NVLM-D-72B"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
@ -266,14 +260,11 @@ def load_nvlm_d(question: str, image_urls: list[str]):
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
stop_token_ids = None
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
@ -281,7 +272,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "mistral-community/pixtral-12b"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
@ -291,14 +282,11 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
|
||||
placeholders = "[IMG]" * len(image_urls)
|
||||
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
|
||||
stop_token_ids = None
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
@ -315,7 +303,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
#
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
@ -326,14 +314,11 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
placeholders = "\n".join(f"<|image_{i}|>"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
|
||||
stop_token_ids = None
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
@ -347,7 +332,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_path,
|
||||
trust_remote_code=True,
|
||||
max_model_len=10000,
|
||||
@ -355,32 +340,24 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
lora_extra_vocab_size=0,
|
||||
)
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
|
||||
placeholders = "".join(f"<|image_{i}|>"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
|
||||
stop_token_ids = None
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
def load_qwen_vl_chat(question: str,
|
||||
image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen-VL-Chat"
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=1024,
|
||||
@ -411,7 +388,7 @@ def load_qwen_vl_chat(question: str,
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
@ -419,7 +396,7 @@ def load_qwen_vl_chat(question: str,
|
||||
)
|
||||
|
||||
|
||||
def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
|
||||
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
try:
|
||||
from qwen_vl_utils import process_vision_info
|
||||
except ModuleNotFoundError:
|
||||
@ -431,7 +408,7 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||
|
||||
# Tested on L40
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=32768 if process_vision_info is None else 4096,
|
||||
max_num_seqs=5,
|
||||
@ -460,23 +437,19 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
stop_token_ids = None
|
||||
|
||||
if process_vision_info is None:
|
||||
image_data = [fetch_image(url) for url in image_urls]
|
||||
else:
|
||||
image_data, _ = process_vision_info(messages)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=image_data,
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
||||
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
try:
|
||||
from qwen_vl_utils import process_vision_info
|
||||
except ModuleNotFoundError:
|
||||
@ -487,7 +460,7 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
||||
|
||||
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
|
||||
llm = LLM(
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=32768 if process_vision_info is None else 4096,
|
||||
max_num_seqs=5,
|
||||
@ -516,8 +489,6 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
stop_token_ids = None
|
||||
|
||||
if process_vision_info is None:
|
||||
image_data = [fetch_image(url) for url in image_urls]
|
||||
else:
|
||||
@ -525,11 +496,9 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
|
||||
return_video_kwargs=False)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=image_data,
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
@ -551,14 +520,25 @@ model_example_map = {
|
||||
}
|
||||
|
||||
|
||||
def run_generate(model, question: str, image_urls: list[str]):
|
||||
def run_generate(model, question: str, image_urls: list[str],
|
||||
seed: Optional[int]):
|
||||
req_data = model_example_map[model](question, image_urls)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
if req_data.lora_requests:
|
||||
for lora_request in req_data.lora_requests:
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=req_data.stop_token_ids)
|
||||
|
||||
outputs = req_data.llm.generate(
|
||||
outputs = llm.generate(
|
||||
{
|
||||
"prompt": req_data.prompt,
|
||||
"multi_modal_data": {
|
||||
@ -572,13 +552,24 @@ def run_generate(model, question: str, image_urls: list[str]):
|
||||
print(generated_text)
|
||||
|
||||
|
||||
def run_chat(model: str, question: str, image_urls: list[str]):
|
||||
def run_chat(model: str, question: str, image_urls: list[str],
|
||||
seed: Optional[int]):
|
||||
req_data = model_example_map[model](question, image_urls)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
if req_data.lora_requests:
|
||||
for lora_request in req_data.lora_requests:
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=req_data.stop_token_ids)
|
||||
outputs = req_data.llm.chat(
|
||||
outputs = llm.chat(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -607,11 +598,12 @@ def run_chat(model: str, question: str, image_urls: list[str]):
|
||||
def main(args: Namespace):
|
||||
model = args.model_type
|
||||
method = args.method
|
||||
seed = args.seed
|
||||
|
||||
if method == "generate":
|
||||
run_generate(model, QUESTION, IMAGE_URLS)
|
||||
run_generate(model, QUESTION, IMAGE_URLS, seed)
|
||||
elif method == "chat":
|
||||
run_chat(model, QUESTION, IMAGE_URLS)
|
||||
run_chat(model, QUESTION, IMAGE_URLS, seed)
|
||||
else:
|
||||
raise ValueError(f"Invalid method: {method}")
|
||||
|
||||
@ -632,6 +624,10 @@ if __name__ == "__main__":
|
||||
default="generate",
|
||||
choices=["generate", "chat"],
|
||||
help="The method to run in `vllm.LLM`.")
|
||||
parser.add_argument("--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -8,6 +8,9 @@ set -xe
|
||||
echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
|
||||
sleep 1
|
||||
|
||||
# meta-llama/Meta-Llama-3.1-8B-Instruct or deepseek-ai/DeepSeek-V2-Lite
|
||||
MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct}
|
||||
|
||||
# Trap the SIGINT signal (triggered by Ctrl+C)
|
||||
trap 'cleanup' INT
|
||||
|
||||
@ -44,18 +47,20 @@ wait_for_server() {
|
||||
# You can also adjust --kv-ip and --kv-port for distributed inference.
|
||||
|
||||
# prefilling instance, which is the KV producer
|
||||
CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
|
||||
--port 8100 \
|
||||
--max-model-len 100 \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--trust-remote-code \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
|
||||
|
||||
# decoding instance, which is the KV consumer
|
||||
CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
|
||||
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
|
||||
--port 8200 \
|
||||
--max-model-len 100 \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--trust-remote-code \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
|
||||
|
||||
@ -78,7 +83,7 @@ sleep 1
|
||||
output1=$(curl -X POST -s http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "'"$MODEL_NAME"'",
|
||||
"prompt": "San Francisco is a",
|
||||
"max_tokens": 10,
|
||||
"temperature": 0
|
||||
@ -87,7 +92,7 @@ output1=$(curl -X POST -s http://localhost:8000/v1/completions \
|
||||
output2=$(curl -X POST -s http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
"model": "'"$MODEL_NAME"'",
|
||||
"prompt": "Santa Clara is a",
|
||||
"max_tokens": 10,
|
||||
"temperature": 0
|
||||
|
||||
@ -18,9 +18,10 @@ pillow # Required for image processing
|
||||
prometheus-fastapi-instrumentator >= 7.0.0
|
||||
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||
lm-format-enforcer >= 0.10.11, < 0.11
|
||||
llguidance >= 0.7.2, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.15; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
@ -28,7 +29,7 @@ pyzmq
|
||||
msgspec
|
||||
gguf == 0.10.0
|
||||
importlib_metadata
|
||||
mistral_common[opencv] >= 1.5.0
|
||||
mistral_common[opencv] >= 1.5.4
|
||||
pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||
|
||||
@ -3,7 +3,8 @@
|
||||
|
||||
# Dependencies for CPUs
|
||||
torch==2.6.0+cpu; platform_machine == "x86_64"
|
||||
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
|
||||
torch==2.6.0; platform_system == "Darwin"
|
||||
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
torch==2.7.0.dev20250304; platform_machine == "s390x"
|
||||
|
||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||
|
||||
@ -15,7 +15,7 @@ pydantic >= 2.8
|
||||
torch
|
||||
py-cpuinfo
|
||||
transformers
|
||||
mistral_common >= 1.5.0
|
||||
mistral_common >= 1.5.4
|
||||
aiohttp
|
||||
starlette
|
||||
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
|
||||
|
||||
@ -27,10 +27,10 @@ torchaudio==2.6.0
|
||||
torchvision==0.21.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.5.0 # required for pixtral test
|
||||
mistral_common[opencv] >= 1.5.4 # required for pixtral test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api]==0.4.4 # required for model evaluation test
|
||||
transformers==4.48.2
|
||||
transformers==4.48.2
|
||||
# quantization
|
||||
bitsandbytes>=0.45.3
|
||||
buildkite-test-collector==0.1.9
|
||||
@ -40,4 +40,4 @@ tritonclient==2.51.0
|
||||
|
||||
numpy < 2.0.0
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
|
||||
@ -235,7 +235,7 @@ mbstrdecoder==1.1.3
|
||||
# typepy
|
||||
mdurl==0.1.2
|
||||
# via markdown-it-py
|
||||
mistral-common==1.5.1
|
||||
mistral-common==1.5.4
|
||||
# via -r requirements/test.in
|
||||
more-itertools==10.5.0
|
||||
# via lm-eval
|
||||
|
||||
@ -17,9 +17,9 @@ ray[data]
|
||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250319-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
|
||||
30
setup.py
30
setup.py
@ -294,26 +294,28 @@ class repackage_wheel(build_ext):
|
||||
]).decode("utf-8")
|
||||
upstream_main_commit = json.loads(resp_json)["sha"]
|
||||
|
||||
# Check if the local main branch is up-to-date. This is to ensure
|
||||
# the base commit we found is the most recent commit on the main
|
||||
# branch.
|
||||
local_main_commit = subprocess.check_output(
|
||||
["git", "rev-parse", "main"]).decode("utf-8").strip()
|
||||
if local_main_commit != upstream_main_commit:
|
||||
raise ValueError(
|
||||
f"Local main branch ({local_main_commit}) is not "
|
||||
"up-to-date with upstream main branch "
|
||||
f"({upstream_main_commit}). Please pull the latest "
|
||||
"changes from upstream main branch first.")
|
||||
# Check if the upstream_main_commit exists in the local repo
|
||||
try:
|
||||
subprocess.check_output(
|
||||
["git", "cat-file", "-e", f"{upstream_main_commit}"])
|
||||
except subprocess.CalledProcessError:
|
||||
# If not present, fetch it from the remote repository.
|
||||
# Note that this does not update any local branches,
|
||||
# but ensures that this commit ref and its history are
|
||||
# available in our local repo.
|
||||
subprocess.check_call([
|
||||
"git", "fetch", "https://github.com/vllm-project/vllm",
|
||||
"main"
|
||||
])
|
||||
|
||||
# Then get the commit hash of the current branch that is the same as
|
||||
# the upstream main commit.
|
||||
current_branch = subprocess.check_output(
|
||||
["git", "branch", "--show-current"]).decode("utf-8").strip()
|
||||
|
||||
base_commit = subprocess.check_output(
|
||||
["git", "merge-base", "main",
|
||||
current_branch]).decode("utf-8").strip()
|
||||
base_commit = subprocess.check_output([
|
||||
"git", "merge-base", f"{upstream_main_commit}", current_branch
|
||||
]).decode("utf-8").strip()
|
||||
return base_commit
|
||||
except ValueError as err:
|
||||
raise ValueError(err) from None
|
||||
|
||||
@ -7,10 +7,10 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_python_error():
|
||||
"""
|
||||
Test if Python error occurs when there's low-level
|
||||
@ -36,7 +36,7 @@ def test_python_error():
|
||||
allocator.wake_up()
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_basic_cumem():
|
||||
# some tensors from default memory pool
|
||||
shape = (1024, 1024)
|
||||
@ -69,7 +69,7 @@ def test_basic_cumem():
|
||||
assert torch.allclose(output, torch.ones_like(output) * 3)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_cumem_with_cudagraph():
|
||||
allocator = CuMemAllocator.get_instance()
|
||||
with allocator.use_memory_pool():
|
||||
@ -114,7 +114,7 @@ def test_cumem_with_cudagraph():
|
||||
assert torch.allclose(y, x + 1)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize(
|
||||
"model, use_v1",
|
||||
[
|
||||
|
||||
@ -60,7 +60,7 @@ class TestSetting:
|
||||
# embedding model
|
||||
TestSetting(
|
||||
model="BAAI/bge-multilingual-gemma2",
|
||||
model_args=["--task", "embed"],
|
||||
model_args=["--task", "embed", "--dtype", "bfloat16"],
|
||||
pp_size=1,
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
|
||||
@ -12,7 +12,7 @@ from vllm import LLM, SamplingParams
|
||||
from vllm.config import CompilationLevel
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.fixture(params=None, name="model_info")
|
||||
@ -78,7 +78,7 @@ def models_list_fixture(request):
|
||||
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
|
||||
)
|
||||
@pytest.mark.parametrize("model_info", "", indirect=True)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_full_graph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
|
||||
@ -4,34 +4,38 @@ import pickle
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torch._inductor.codecache import BypassFxGraphCache
|
||||
|
||||
from vllm.compilation.config import CompilationConfig
|
||||
from vllm.compilation.inductor_pass import (CallableInductorPass,
|
||||
as_inductor_pass)
|
||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
||||
from vllm.compilation.pass_manager import PostGradPassManager
|
||||
from vllm.config import CompilationConfig
|
||||
|
||||
|
||||
def simple_callable(graph: torch.fx.Graph):
|
||||
pass
|
||||
|
||||
|
||||
@as_inductor_pass(files=(__file__, ))
|
||||
def callable_decorated(graph: torch.fx.Graph):
|
||||
pass
|
||||
callable_uuid = CallableInductorPass(simple_callable,
|
||||
InductorPass.hash_source(__file__))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"works, callable",
|
||||
[(False, simple_callable), (True, callable_decorated),
|
||||
(True, CallableInductorPass(simple_callable, "simple_callable"))])
|
||||
[
|
||||
(False, simple_callable),
|
||||
(True, callable_uuid),
|
||||
(True, CallableInductorPass(simple_callable)),
|
||||
],
|
||||
)
|
||||
def test_pass_manager(works: bool, callable):
|
||||
config = CompilationConfig().pass_config
|
||||
pass_manager = PostGradPassManager([callable])
|
||||
pass_manager.configure(config) # Adds default passes
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
# Try to add the callable to the pass manager
|
||||
if works:
|
||||
pass_manager.add(callable)
|
||||
pickle.dumps(pass_manager)
|
||||
else:
|
||||
with pytest.raises(BypassFxGraphCache):
|
||||
pickle.dumps(pass_manager)
|
||||
with pytest.raises(AssertionError):
|
||||
pass_manager.add(callable)
|
||||
|
||||
@ -14,8 +14,8 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from huggingface_hub import snapshot_download
|
||||
from PIL import Image
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
|
||||
BatchFeature)
|
||||
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
|
||||
BatchEncoding, BatchFeature)
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from tests.models.utils import (TokensTextLogprobs,
|
||||
@ -23,7 +23,7 @@ from tests.models.utils import (TokensTextLogprobs,
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import TaskOption, TokenizerPoolConfig
|
||||
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
@ -34,8 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
|
||||
identity, is_list_of)
|
||||
from vllm.utils import cuda_device_count_stateless, is_list_of
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -271,14 +270,18 @@ _R = TypeVar("_R")
|
||||
|
||||
class HfRunner:
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
def get_default_device(self):
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
return ("cpu" if current_platform.is_cpu()
|
||||
or current_platform.is_openvino() else "cuda")
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
if x is None or isinstance(x, (bool, )):
|
||||
return x
|
||||
|
||||
if device is None:
|
||||
device = "cpu" if current_platform.is_cpu(
|
||||
) or current_platform.is_openvino() else "cuda"
|
||||
device = self.device
|
||||
|
||||
if isinstance(x, dict):
|
||||
return {k: self.wrap_device(v, device) for k, v in x.items()}
|
||||
@ -291,45 +294,59 @@ class HfRunner:
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str,
|
||||
dtype: str = "half",
|
||||
dtype: str = "auto",
|
||||
*,
|
||||
model_kwargs: Optional[dict[str, Any]] = None,
|
||||
is_sentence_transformer: bool = False,
|
||||
is_cross_encoder: bool = False,
|
||||
skip_tokenizer_init: bool = False,
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
postprocess_inputs: Callable[..., BatchEncoding] = identity,
|
||||
) -> None:
|
||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
||||
|
||||
self.model_name = model_name
|
||||
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
self.device = self.get_default_device()
|
||||
self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)
|
||||
|
||||
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
||||
model_kwargs.setdefault("torch_dtype", torch_dtype)
|
||||
|
||||
if is_sentence_transformer:
|
||||
# Lazy init required for AMD CI
|
||||
from sentence_transformers import SentenceTransformer
|
||||
self.model = self.wrap_device(
|
||||
SentenceTransformer(
|
||||
model_name,
|
||||
device="cpu",
|
||||
trust_remote_code=True,
|
||||
).to(dtype=torch_dtype))
|
||||
|
||||
self.model = SentenceTransformer(
|
||||
model_name,
|
||||
device=self.device,
|
||||
model_kwargs=model_kwargs,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
elif is_cross_encoder:
|
||||
# Lazy init required for AMD CI
|
||||
from sentence_transformers import CrossEncoder
|
||||
self.model = CrossEncoder(model_name,
|
||||
device="cpu",
|
||||
trust_remote_code=True)
|
||||
self.model.model = self.wrap_device(self.model.model)\
|
||||
.to(dtype=torch_dtype)
|
||||
|
||||
self.model = CrossEncoder(
|
||||
model_name,
|
||||
device=self.device,
|
||||
automodel_args=model_kwargs,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
else:
|
||||
model_kwargs = model_kwargs if model_kwargs is not None else {}
|
||||
self.model = self.wrap_device(
|
||||
auto_cls.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
**model_kwargs,
|
||||
))
|
||||
model = auto_cls.from_pretrained(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
**model_kwargs,
|
||||
)
|
||||
|
||||
if (getattr(model, "quantization_method", None) != "bitsandbytes"
|
||||
and len({p.device
|
||||
for p in model.parameters()}) < 2):
|
||||
model = model.to(self.device)
|
||||
|
||||
self.model = model
|
||||
|
||||
if not skip_tokenizer_init:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
@ -349,16 +366,13 @@ class HfRunner:
|
||||
if skip_tokenizer_init:
|
||||
self.tokenizer = self.processor.tokenizer
|
||||
|
||||
self.dtype = dtype
|
||||
self.postprocess_inputs = postprocess_inputs
|
||||
|
||||
def get_inputs(
|
||||
self,
|
||||
prompts: list[str],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
) -> list[BatchEncoding]:
|
||||
) -> list[Union[BatchFeature, BatchEncoding]]:
|
||||
if images is not None:
|
||||
assert len(prompts) == len(images)
|
||||
|
||||
@ -368,7 +382,7 @@ class HfRunner:
|
||||
if audios is not None:
|
||||
assert len(prompts) == len(audios)
|
||||
|
||||
all_inputs: list[BatchEncoding] = []
|
||||
all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
|
||||
for i, prompt in enumerate(prompts):
|
||||
processor_kwargs: dict[str, Any] = {
|
||||
"text": prompt,
|
||||
@ -384,7 +398,8 @@ class HfRunner:
|
||||
processor_kwargs["sampling_rate"] = sr
|
||||
|
||||
inputs = self.processor(**processor_kwargs)
|
||||
inputs = self.postprocess_inputs(inputs, dtype=self.dtype)
|
||||
if isinstance(inputs, BatchFeature):
|
||||
inputs = inputs.to(dtype=self.dtype)
|
||||
|
||||
all_inputs.append(inputs)
|
||||
|
||||
@ -417,7 +432,7 @@ class HfRunner:
|
||||
outputs: list[tuple[list[list[int]], list[str]]] = []
|
||||
for inputs in all_inputs:
|
||||
output_ids = self.model.generate(
|
||||
**self.wrap_device(inputs, device=self.model.device.type),
|
||||
**self.wrap_device(inputs),
|
||||
use_cache=True,
|
||||
**kwargs,
|
||||
)
|
||||
@ -488,7 +503,7 @@ class HfRunner:
|
||||
all_logprobs: list[list[torch.Tensor]] = []
|
||||
for inputs in all_inputs:
|
||||
output = self.model.generate(
|
||||
**self.wrap_device(inputs, device=self.model.device.type),
|
||||
**self.wrap_device(inputs),
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_tokens,
|
||||
@ -569,7 +584,7 @@ class HfRunner:
|
||||
|
||||
for inputs in all_inputs:
|
||||
output = self.model.generate(
|
||||
**self.wrap_device(inputs, device=self.model.device.type),
|
||||
**self.wrap_device(inputs),
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_tokens,
|
||||
@ -620,19 +635,15 @@ class HfRunner:
|
||||
if images is not None and images[i] is not None:
|
||||
processor_kwargs["images"] = images[i]
|
||||
|
||||
encoder_inputs = self.wrap_device(
|
||||
self.processor(**processor_kwargs),
|
||||
device=self.model.device.type,
|
||||
)
|
||||
encoder_inputs = self.processor(**processor_kwargs)
|
||||
encoder_inputs = self.wrap_device(encoder_inputs)
|
||||
|
||||
if decoder_prompt is None:
|
||||
decoder_input_ids = None
|
||||
else:
|
||||
decoder_input_ids = self.wrap_device(
|
||||
self.tokenizer(decoder_prompt,
|
||||
return_tensors="pt").input_ids,
|
||||
device=self.model.device.type,
|
||||
)
|
||||
decoder_inputs = self.tokenizer(decoder_prompt,
|
||||
return_tensors="pt")
|
||||
decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)
|
||||
|
||||
output = self.model.generate(
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
@ -684,6 +695,7 @@ class VllmRunner:
|
||||
"""
|
||||
The default value of some arguments have been modified from
|
||||
:class:`~vllm.LLM` as follows:
|
||||
|
||||
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
|
||||
- `seed`: Set to `0` instead of `None` for test reproducibility.
|
||||
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
|
||||
@ -701,10 +713,8 @@ class VllmRunner:
|
||||
tokenizer_mode: str = "auto",
|
||||
trust_remote_code: bool = True,
|
||||
seed: Optional[int] = 0,
|
||||
# Use smaller max model length, otherwise bigger model cannot run due
|
||||
# to kv cache size limit.
|
||||
max_model_len: int = 1024,
|
||||
dtype: str = "half",
|
||||
dtype: str = "auto",
|
||||
disable_log_stats: bool = True,
|
||||
tensor_parallel_size: int = 1,
|
||||
block_size: int = 16,
|
||||
@ -1110,4 +1120,4 @@ def pytest_collection_modifyitems(config, items):
|
||||
skip_optional = pytest.mark.skip(reason="need --optional option to run")
|
||||
for item in items:
|
||||
if "optional" in item.keywords:
|
||||
item.add_marker(skip_optional)
|
||||
item.add_marker(skip_optional)
|
||||
|
||||
@ -8,7 +8,7 @@ import pytest
|
||||
from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_expert_parallel")
|
||||
|
||||
@ -209,7 +209,7 @@ def _compare_tp(
|
||||
for params in settings.iter_params(model_name)
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ep(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
|
||||
@ -17,7 +17,7 @@ from vllm.config import TaskOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_pipeline_parallel")
|
||||
|
||||
@ -402,7 +402,7 @@ def _compare_tp(
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_language_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
@ -431,7 +431,7 @@ def test_tp_language_generation(
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_language_embedding(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
@ -460,7 +460,7 @@ def test_tp_language_embedding(
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_tp_multimodal_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
|
||||
@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import compare_two_settings, fork_new_process_for_each_test
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import LiteralString
|
||||
@ -18,7 +18,7 @@ if TYPE_CHECKING:
|
||||
"FLASH_ATTN",
|
||||
"FLASHINFER",
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_pp_cudagraph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
PP_SIZE: int,
|
||||
|
||||
@ -9,6 +9,8 @@ import torch.distributed as dist
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import get_world_group
|
||||
|
||||
dist.init_process_group(backend="gloo")
|
||||
|
||||
# Create prompts
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
|
||||
@ -64,7 +64,6 @@ def test_multi_chat():
|
||||
def test_chat_multi_image(image_urls: list[str]):
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
dtype="bfloat16",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
enforce_eager=True,
|
||||
|
||||
@ -4,12 +4,12 @@ import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
from ...utils import fork_new_process_for_each_test
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
@pytest.mark.parametrize("backend", ["mp", "ray"])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_collective_rpc(tp_size, backend):
|
||||
if tp_size == 1 and backend == "ray":
|
||||
pytest.skip("Skip duplicate test case")
|
||||
|
||||
@ -14,7 +14,9 @@ from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
GUIDED_DECODING_BACKENDS = [
|
||||
"outlines", "lm-format-enforcer", "xgrammar", "guidance"
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
||||
@ -18,8 +18,6 @@ TEST_AUDIO_URLS = [
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = [
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
|
||||
@ -24,8 +24,6 @@ def server():
|
||||
args = [
|
||||
"--task",
|
||||
"generate",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"32768",
|
||||
"--max-num-seqs",
|
||||
|
||||
@ -25,8 +25,6 @@ def server():
|
||||
args = [
|
||||
"--task",
|
||||
"generate",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
|
||||
@ -28,8 +28,6 @@ def server():
|
||||
args = [
|
||||
"--task",
|
||||
"embed",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
|
||||
@ -34,7 +34,7 @@ def phi3v_model_config():
|
||||
tokenizer=PHI3V_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
@ -58,7 +58,7 @@ def mllama_model_config():
|
||||
tokenizer=MLLAMA_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
tokenizer=MLLAMA_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
|
||||
@ -15,6 +15,7 @@ NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
|
||||
HEAD_SIZES = [128, 256]
|
||||
BLOCK_SIZES = [16, 32]
|
||||
DTYPES = [torch.float16, torch.bfloat16]
|
||||
QDTYPES = [None, torch.float8_e4m3fn]
|
||||
# one value large enough to test overflow in index calculation.
|
||||
# one value small enough to test the schema op check
|
||||
NUM_BLOCKS = [32768, 2048]
|
||||
@ -85,6 +86,7 @@ def ref_paged_attn(
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("sliding_window", [None, 256])
|
||||
@pytest.mark.parametrize("fa_version", [2, 3])
|
||||
@pytest.mark.parametrize("q_dtype", QDTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_flash_attn_with_paged_kv(
|
||||
use_out: bool,
|
||||
@ -97,11 +99,15 @@ def test_flash_attn_with_paged_kv(
|
||||
num_blocks: int,
|
||||
sliding_window: Optional[int],
|
||||
fa_version: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
if not is_fa_version_supported(fa_version):
|
||||
pytest.skip(f"Flash attention version {fa_version} not supported due "
|
||||
f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
|
||||
if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
|
||||
pytest.skip("Flash attention with quantized inputs is only "
|
||||
"supported on version 3 with bfloat16 base type")
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(kv_lens)
|
||||
@ -130,10 +136,28 @@ def test_flash_attn_with_paged_kv(
|
||||
|
||||
q = query.unsqueeze(1)
|
||||
out = torch.empty_like(q) if use_out else None
|
||||
|
||||
maybe_quantized_query = q
|
||||
maybe_quantized_key_cache = key_cache
|
||||
maybe_quantized_value_cache = value_cache
|
||||
q_descale = None
|
||||
k_descale = None
|
||||
v_descale = None
|
||||
if q_dtype is not None:
|
||||
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor
|
||||
maybe_quantized_query = query.to(q_dtype)
|
||||
maybe_quantized_key_cache = key_cache.to(q_dtype)
|
||||
maybe_quantized_value_cache = value_cache.to(q_dtype)
|
||||
|
||||
scale_shape = (num_seqs, num_kv_heads)
|
||||
q_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
k_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
v_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
|
||||
output = flash_attn_with_kvcache(
|
||||
q=q,
|
||||
k_cache=key_cache,
|
||||
v_cache=value_cache,
|
||||
q=maybe_quantized_query,
|
||||
k_cache=maybe_quantized_key_cache,
|
||||
v_cache=maybe_quantized_value_cache,
|
||||
out=out,
|
||||
softmax_scale=scale,
|
||||
causal=True,
|
||||
@ -142,10 +166,17 @@ def test_flash_attn_with_paged_kv(
|
||||
softcap=soft_cap if soft_cap is not None else 0,
|
||||
window_size=window_size,
|
||||
fa_version=fa_version,
|
||||
q_descale=q_descale,
|
||||
k_descale=k_descale,
|
||||
v_descale=v_descale,
|
||||
)
|
||||
output = output if not use_out else out
|
||||
output = output.squeeze(1)
|
||||
|
||||
atol, rtol = 1.5e-2, 1e-2
|
||||
if q_dtype is not None:
|
||||
atol, rtol = 1.5e-1, 1.5e-1
|
||||
|
||||
ref_output = ref_paged_attn(query=query,
|
||||
key_cache=key_cache,
|
||||
value_cache=value_cache,
|
||||
@ -155,7 +186,7 @@ def test_flash_attn_with_paged_kv(
|
||||
scale=scale,
|
||||
soft_cap=soft_cap,
|
||||
sliding_window=sliding_window)
|
||||
torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
|
||||
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
|
||||
@ -171,6 +202,7 @@ def test_flash_attn_with_paged_kv(
|
||||
@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("fa_version", [2, 3])
|
||||
@pytest.mark.parametrize("q_dtype", QDTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_varlen_with_paged_kv(
|
||||
use_out: bool,
|
||||
@ -183,11 +215,15 @@ def test_varlen_with_paged_kv(
|
||||
soft_cap: Optional[float],
|
||||
num_blocks: int,
|
||||
fa_version: int,
|
||||
q_dtype: Optional[torch.dtype],
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
if not is_fa_version_supported(fa_version):
|
||||
pytest.skip(f"Flash attention version {fa_version} not supported due "
|
||||
f"to: \"{fa_version_unsupported_reason(fa_version)}\"")
|
||||
if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
|
||||
pytest.skip("Flash attention with quantized inputs is only "
|
||||
"supported on version 3 with bfloat16 base type")
|
||||
current_platform.seed_everything(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
@ -223,10 +259,28 @@ def test_varlen_with_paged_kv(
|
||||
dtype=torch.int32)
|
||||
|
||||
out = torch.empty_like(query) if use_out else None
|
||||
|
||||
maybe_quantized_query = query
|
||||
maybe_quantized_key_cache = key_cache
|
||||
maybe_quantized_value_cache = value_cache
|
||||
q_descale = None
|
||||
k_descale = None
|
||||
v_descale = None
|
||||
if q_dtype is not None:
|
||||
# QKV are drawn from N(0, 1): no need for a fp8 scaling factor
|
||||
maybe_quantized_query = query.to(q_dtype)
|
||||
maybe_quantized_key_cache = key_cache.to(q_dtype)
|
||||
maybe_quantized_value_cache = value_cache.to(q_dtype)
|
||||
|
||||
scale_shape = (num_seqs, num_kv_heads)
|
||||
q_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
k_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
v_descale = torch.ones(scale_shape, dtype=torch.float32)
|
||||
|
||||
output = flash_attn_varlen_func(
|
||||
q=query,
|
||||
k=key_cache,
|
||||
v=value_cache,
|
||||
q=maybe_quantized_query,
|
||||
k=maybe_quantized_key_cache,
|
||||
v=maybe_quantized_value_cache,
|
||||
out=out,
|
||||
cu_seqlens_q=cu_query_lens,
|
||||
seqused_k=kv_lens,
|
||||
@ -238,6 +292,9 @@ def test_varlen_with_paged_kv(
|
||||
block_table=block_tables,
|
||||
softcap=soft_cap if soft_cap is not None else 0,
|
||||
fa_version=fa_version,
|
||||
q_descale=q_descale,
|
||||
k_descale=k_descale,
|
||||
v_descale=v_descale,
|
||||
)
|
||||
output = output if not use_out else out
|
||||
|
||||
@ -252,5 +309,8 @@ def test_varlen_with_paged_kv(
|
||||
sliding_window=sliding_window,
|
||||
soft_cap=soft_cap,
|
||||
)
|
||||
torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \
|
||||
atol, rtol = 1.5e-2, 1e-2
|
||||
if q_dtype is not None:
|
||||
atol, rtol = 1.5e-1, 1.5e-1
|
||||
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
|
||||
f"{torch.max(torch.abs(output - ref_output))}"
|
||||
|
||||
@ -3,10 +3,9 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "THUDM/chatglm3-6b"
|
||||
|
||||
@ -55,7 +54,7 @@ def v1(run_with_both_engines_lora):
|
||||
pass
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -75,7 +74,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -96,7 +95,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
|
||||
@ -4,10 +4,9 @@ import pytest
|
||||
import ray
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
@ -82,7 +81,7 @@ def v1(run_with_both_engines_lora):
|
||||
|
||||
# V1 Test: Failing due to numerics on V1.
|
||||
@pytest.mark.skip_v1
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
@ -97,7 +96,7 @@ def test_llama_lora(sql_lora_files):
|
||||
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
|
||||
# used by the engine yet.
|
||||
@pytest.mark.skip_v1
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_warmup(sql_lora_files):
|
||||
"""Test that the LLM initialization works with a warmup LORA path and
|
||||
is more conservative"""
|
||||
@ -128,7 +127,7 @@ def test_llama_lora_warmup(sql_lora_files):
|
||||
# V1 Test: Failing due to numerics on V1.
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
@ -143,7 +142,7 @@ def test_llama_lora_tp4(sql_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
@ -159,7 +158,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
|
||||
@ -3,11 +3,12 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
|
||||
PROMPT_TEMPLATE = (
|
||||
@ -57,7 +58,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_lora(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -80,7 +81,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
@ -101,7 +102,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
|
||||
@ -4,18 +4,13 @@ from threading import Lock
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.lora.ops.triton_ops # noqa: F401
|
||||
import vllm.lora.ops.triton_ops.v1 # noqa: F401
|
||||
from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
|
||||
bgmv_shrink, sgmv_expand,
|
||||
sgmv_expand_slice, sgmv_shrink)
|
||||
import vllm.lora.ops.torch_ops as torch_ops
|
||||
import vllm.lora.ops.triton_ops as triton_ops
|
||||
from vllm.lora.ops.triton_ops import LoRAKernelMeta
|
||||
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
||||
from vllm.lora.ops.triton_ops.v1 import V1KernelMeta
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .utils import (PunicaTensors, assert_close, generate_data,
|
||||
generate_data_for_expand_nslices,
|
||||
generate_data_for_nslices)
|
||||
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
|
||||
|
||||
|
||||
# Utility shrink and expand operations used as reference implementations.
|
||||
@ -26,10 +21,10 @@ def sgmv_shrink_for_nslices(
|
||||
prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
|
||||
num_tokens: int, scaling: float):
|
||||
"""
|
||||
Wrapper around sgmv_shrink that handles any nslices.
|
||||
Wrapper around torch_ops.sgmv_shrink that handles any nslices.
|
||||
"""
|
||||
for index in range(nslices):
|
||||
sgmv_shrink(
|
||||
torch_ops.sgmv_shrink(
|
||||
inputs_tensor,
|
||||
lora_weights_lst[index],
|
||||
out_tensor[index],
|
||||
@ -53,11 +48,11 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
|
||||
max_seq_length: int, num_tokens: int,
|
||||
add_inputs: bool) -> None:
|
||||
"""
|
||||
Wrapper around sgmv_expand that handles any nslices.
|
||||
Wrapper around torch_ops.sgmv_expand that handles any nslices.
|
||||
"""
|
||||
if nslices == 1:
|
||||
# Verify the torch's sgmv_expand op
|
||||
sgmv_expand(
|
||||
torch_ops.sgmv_expand(
|
||||
inputs_tensor[0],
|
||||
lora_weights_lst[0],
|
||||
out_tensor,
|
||||
@ -73,7 +68,7 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
|
||||
slice_offset = 0
|
||||
for index in range(nslices):
|
||||
lora_weights = lora_weights_lst[index]
|
||||
sgmv_expand_slice(
|
||||
torch_ops.sgmv_expand_slice(
|
||||
inputs_tensor[index],
|
||||
lora_weights,
|
||||
out_tensor,
|
||||
@ -93,12 +88,13 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
|
||||
_dict_lock = Lock()
|
||||
|
||||
|
||||
def check_shrink_kernels(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int, dtype: torch.dtype,
|
||||
device: str, seq_length: int, scaling: float):
|
||||
def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int,
|
||||
dtype: torch.dtype, device: str, seq_length: int,
|
||||
scaling: float):
|
||||
"""
|
||||
Compare outputs of vllm.sgmv_shrink and vllm.v1_shrink kernel against a
|
||||
reference implementation.
|
||||
Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
|
||||
kernels.
|
||||
"""
|
||||
data: PunicaTensors = generate_data_for_nslices(
|
||||
batches,
|
||||
@ -118,35 +114,24 @@ def check_shrink_kernels(batches: int, num_loras: int, rank: int,
|
||||
data.prompt_lora_mapping, batches, max_seq_length,
|
||||
token_nums)
|
||||
|
||||
# Setup metadata information for the V1 kernel.
|
||||
v1_meta = V1KernelMeta.make(max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device='cuda')
|
||||
v1_meta.prepare_tensors(data.token_lora_mapping)
|
||||
# Setup metadata information for the LoRA kernel.
|
||||
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device='cuda')
|
||||
lora_meta.prepare_tensors(data.token_lora_mapping)
|
||||
|
||||
ref_out_tensor = data.ref_out_tensor
|
||||
sgmv_out_tensor = data.our_out_tensor
|
||||
v1_out_tensor = data.our_out_tensor.clone()
|
||||
out_tensor = data.our_out_tensor.clone()
|
||||
|
||||
# Preventing cache error pointer.
|
||||
with _dict_lock:
|
||||
# SGMV shrink kernel
|
||||
# lora_shrink kernel
|
||||
_LORA_A_PTR_DICT.clear()
|
||||
torch.ops.vllm.sgmv_shrink(
|
||||
triton_ops.lora_shrink(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
sgmv_out_tensor,
|
||||
*sgmv_meta_args,
|
||||
scaling,
|
||||
)
|
||||
|
||||
# V1 shrink kernel
|
||||
_LORA_A_PTR_DICT.clear()
|
||||
torch.ops.vllm.v1_shrink(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
v1_out_tensor,
|
||||
*v1_meta.meta_args(token_nums=token_nums),
|
||||
out_tensor,
|
||||
*lora_meta.meta_args(token_nums=token_nums),
|
||||
scaling,
|
||||
)
|
||||
|
||||
@ -160,16 +145,16 @@ def check_shrink_kernels(batches: int, num_loras: int, rank: int,
|
||||
scaling,
|
||||
)
|
||||
|
||||
assert_close(sgmv_out_tensor, ref_out_tensor)
|
||||
assert_close(v1_out_tensor, ref_out_tensor)
|
||||
assert_close(out_tensor, ref_out_tensor)
|
||||
|
||||
|
||||
def check_expand_kernels(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int, dtype: torch.dtype,
|
||||
device: str, seq_length: int, add_inputs: bool):
|
||||
def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int,
|
||||
dtype: torch.dtype, device: str, seq_length: int,
|
||||
add_inputs: bool):
|
||||
"""
|
||||
Compare outputs of vllm.sgmv_expand and vllm.v1_expand kernels against a
|
||||
reference implementation.
|
||||
Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
|
||||
kernels.
|
||||
"""
|
||||
data: PunicaTensors = generate_data_for_nslices(
|
||||
batches,
|
||||
@ -190,37 +175,25 @@ def check_expand_kernels(batches: int, num_loras: int, rank: int,
|
||||
data.prompt_lora_mapping, batches, max_seq_length,
|
||||
token_nums)
|
||||
|
||||
# Setup metadata information for the V1 kernel.
|
||||
v1_meta = V1KernelMeta.make(max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device='cuda')
|
||||
v1_meta.prepare_tensors(data.token_lora_mapping)
|
||||
# Setup metadata information for the LoRA kernel.
|
||||
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device='cuda')
|
||||
lora_meta.prepare_tensors(data.token_lora_mapping)
|
||||
|
||||
# Setup output tensors
|
||||
ref_out_tensor = data.ref_out_tensor
|
||||
sgmv_out_tensor = data.our_out_tensor
|
||||
v1_out_tensor = data.our_out_tensor.clone()
|
||||
out_tensor = data.our_out_tensor.clone()
|
||||
|
||||
with _dict_lock:
|
||||
# SGMV expand kernel
|
||||
# lora_expand kernel
|
||||
_LORA_B_PTR_DICT.clear()
|
||||
torch.ops.vllm.sgmv_expand(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
sgmv_out_tensor,
|
||||
*sgmv_meta_args,
|
||||
offset_start=0,
|
||||
add_inputs=add_inputs,
|
||||
)
|
||||
|
||||
# V1 expand kernel
|
||||
_LORA_B_PTR_DICT.clear()
|
||||
torch.ops.vllm.v1_expand(data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
v1_out_tensor,
|
||||
*v1_meta.meta_args(token_nums=token_nums),
|
||||
offset_start=0,
|
||||
add_inputs=add_inputs)
|
||||
triton_ops.lora_expand(data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
out_tensor,
|
||||
*lora_meta.meta_args(token_nums=token_nums),
|
||||
offset_start=0,
|
||||
add_inputs=add_inputs)
|
||||
|
||||
# Reference
|
||||
sgmv_expand_for_nslices(nslices,
|
||||
@ -231,124 +204,7 @@ def check_expand_kernels(batches: int, num_loras: int, rank: int,
|
||||
*sgmv_meta_args,
|
||||
add_inputs=add_inputs)
|
||||
|
||||
assert_close(sgmv_out_tensor, ref_out_tensor)
|
||||
assert_close(v1_out_tensor, ref_out_tensor)
|
||||
|
||||
|
||||
def check_bgmv_shrink(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, dtype: torch.dtype, device: str,
|
||||
scaling: float):
|
||||
"""
|
||||
Compare vllm.bgmv_shrink against a reference implementation.
|
||||
"""
|
||||
seq_length = 1
|
||||
data: PunicaTensors = generate_data(
|
||||
batches,
|
||||
hidden_size,
|
||||
num_loras,
|
||||
rank,
|
||||
seq_length,
|
||||
dtype,
|
||||
"shrink",
|
||||
device,
|
||||
)
|
||||
|
||||
torch.ops.vllm.bgmv_shrink(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
data.our_out_tensor,
|
||||
data.token_lora_mapping,
|
||||
scaling,
|
||||
)
|
||||
|
||||
bgmv_shrink(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
data.ref_out_tensor,
|
||||
data.token_lora_mapping,
|
||||
scaling,
|
||||
)
|
||||
|
||||
data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
|
||||
assert_close(data.our_out_tensor, data.ref_out_tensor)
|
||||
|
||||
|
||||
def check_bgmv_expand(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, dtype: torch.dtype, device: str,
|
||||
add_inputs: bool):
|
||||
"""
|
||||
Compare vllm.bgmv_expand against a reference implementation.
|
||||
"""
|
||||
seq_length = 1
|
||||
data: PunicaTensors = generate_data(
|
||||
batches,
|
||||
hidden_size,
|
||||
num_loras,
|
||||
rank,
|
||||
seq_length,
|
||||
dtype,
|
||||
"expand",
|
||||
device,
|
||||
)
|
||||
|
||||
torch.ops.vllm.bgmv_expand(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
data.our_out_tensor,
|
||||
data.token_lora_mapping,
|
||||
add_inputs=add_inputs,
|
||||
)
|
||||
bgmv_expand(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights,
|
||||
data.ref_out_tensor,
|
||||
data.token_lora_mapping,
|
||||
add_inputs=add_inputs,
|
||||
)
|
||||
assert_close(data.our_out_tensor, data.ref_out_tensor)
|
||||
|
||||
|
||||
def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int, dtype: torch.dtype,
|
||||
device: str, add_inputs: bool):
|
||||
"""
|
||||
Compare vllm.bgmv_expand_slice against a reference implementation.
|
||||
"""
|
||||
seq_length = 1
|
||||
data: PunicaTensors = generate_data_for_expand_nslices(
|
||||
batches,
|
||||
hidden_size,
|
||||
num_loras,
|
||||
rank,
|
||||
seq_length,
|
||||
dtype,
|
||||
nslices,
|
||||
device,
|
||||
)
|
||||
|
||||
slice_offset = 0
|
||||
for index in range(nslices):
|
||||
torch.ops.vllm.bgmv_expand_slice(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights[index],
|
||||
data.our_out_tensor,
|
||||
data.token_lora_mapping,
|
||||
slice_offset,
|
||||
slice_size=hidden_size,
|
||||
add_inputs=add_inputs,
|
||||
)
|
||||
bgmv_expand_slice(
|
||||
data.inputs_tensor,
|
||||
data.lora_weights[index],
|
||||
data.ref_out_tensor,
|
||||
data.token_lora_mapping,
|
||||
slice_offset,
|
||||
slice_size=hidden_size,
|
||||
add_inputs=add_inputs,
|
||||
)
|
||||
|
||||
slice_offset += hidden_size
|
||||
assert_close(data.our_out_tensor, data.ref_out_tensor)
|
||||
assert_close(out_tensor, ref_out_tensor)
|
||||
|
||||
|
||||
# Tests
|
||||
@ -490,31 +346,31 @@ def test_kernels(
|
||||
op_type: str,
|
||||
):
|
||||
"""
|
||||
Tests SGMV and V1 kernels.
|
||||
Tests LoRA kernels.
|
||||
"""
|
||||
torch.set_default_device(device)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
if op_type == "shrink":
|
||||
check_shrink_kernels(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5)
|
||||
check_lora_shrink_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5)
|
||||
else:
|
||||
check_expand_kernels(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True)
|
||||
check_lora_expand_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", hs_test_params['batches'])
|
||||
@ -538,159 +394,28 @@ def test_kernels_hidden_size(
|
||||
op_type: str,
|
||||
):
|
||||
"""
|
||||
Tests SGMV and V1 kernels.
|
||||
Tests SGMV and LoRA kernels.
|
||||
"""
|
||||
torch.set_default_device(device)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
if op_type == "shrink":
|
||||
check_shrink_kernels(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5)
|
||||
check_lora_shrink_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
scaling=0.5)
|
||||
else:
|
||||
check_expand_kernels(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", test_params['batches'])
|
||||
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
|
||||
@pytest.mark.parametrize("rank", test_params['max_ranks'])
|
||||
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@pytest.mark.parametrize("seed", SEED)
|
||||
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
|
||||
def test_punica_bgmv(
|
||||
batches: int,
|
||||
num_loras: int,
|
||||
rank: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
seed: int,
|
||||
op_type: str,
|
||||
):
|
||||
torch.set_default_device(device)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
if op_type == "shrink":
|
||||
check_bgmv_shrink(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
scaling=0.5)
|
||||
else:
|
||||
check_bgmv_expand(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
add_inputs=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", hs_test_params['batches'])
|
||||
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
|
||||
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
|
||||
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@pytest.mark.parametrize("seed", SEED)
|
||||
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
|
||||
def test_punica_bgmv_hidden_size(
|
||||
batches: int,
|
||||
num_loras: int,
|
||||
rank: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
seed: int,
|
||||
op_type: str,
|
||||
):
|
||||
torch.set_default_device(device)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
if op_type == "shrink":
|
||||
check_bgmv_shrink(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
scaling=0.5)
|
||||
else:
|
||||
check_bgmv_expand(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
add_inputs=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", test_params['batches'])
|
||||
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
|
||||
@pytest.mark.parametrize("rank", test_params['max_ranks'])
|
||||
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
|
||||
@pytest.mark.parametrize("nslices", [2, 3])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@pytest.mark.parametrize("seed", SEED)
|
||||
def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int,
|
||||
hidden_size: int, nslices: int,
|
||||
dtype: torch.dtype, device: str,
|
||||
seed: int):
|
||||
|
||||
torch.set_default_device(device)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
check_bgmv_expand_slice(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
add_inputs=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batches", hs_test_params['batches'])
|
||||
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
|
||||
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
|
||||
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
|
||||
@pytest.mark.parametrize("nslices", [2, 3])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
@pytest.mark.parametrize("seed", SEED)
|
||||
def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int,
|
||||
rank: int, hidden_size: int,
|
||||
nslices: int,
|
||||
dtype: torch.dtype,
|
||||
device: str, seed: int):
|
||||
|
||||
torch.set_default_device(device)
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
check_bgmv_expand_slice(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
add_inputs=True)
|
||||
check_lora_expand_kernel(batches=batches,
|
||||
num_loras=num_loras,
|
||||
rank=rank,
|
||||
hidden_size=hidden_size,
|
||||
nslices=nslices,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
seq_length=128,
|
||||
add_inputs=True)
|
||||
|
||||
@ -3,10 +3,9 @@
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
MODEL_PATH = "ArthurZ/ilama-3.2-1B"
|
||||
|
||||
@ -56,7 +55,7 @@ def v1(run_with_both_engines_lora):
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -77,7 +76,7 @@ def test_ilama_lora(ilama_lora_files):
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -99,7 +98,7 @@ def test_ilama_lora_tp4(ilama_lora_files):
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import json
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
@ -15,7 +16,9 @@ from vllm.model_executor.guided_decoding.outlines_logits_processors import (
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
MODEL_NAME = 'HuggingFaceH4/zephyr-7b-beta'
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
GUIDED_DECODING_BACKENDS = [
|
||||
"outlines", "lm-format-enforcer", "xgrammar", "guidance"
|
||||
]
|
||||
GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT = ["outlines", "xgrammar"]
|
||||
REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
|
||||
|
||||
@ -208,8 +211,6 @@ def test_guided_decoding_backend_options():
|
||||
|
||||
|
||||
def test_pickle_xgrammar_tokenizer_data():
|
||||
|
||||
# TODO: move to another test file for xgrammar
|
||||
try:
|
||||
import xgrammar as xgr
|
||||
except ImportError:
|
||||
@ -217,7 +218,11 @@ def test_pickle_xgrammar_tokenizer_data():
|
||||
|
||||
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
|
||||
TokenizerData)
|
||||
tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
|
||||
tokenizer_data = TokenizerData(
|
||||
metadata=
|
||||
'{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
|
||||
encoded_vocab=['!', '"', '#', '$', '%'],
|
||||
)
|
||||
pickled = pickle.dumps(tokenizer_data)
|
||||
|
||||
assert pickled is not None
|
||||
@ -225,4 +230,5 @@ def test_pickle_xgrammar_tokenizer_data():
|
||||
depickled: TokenizerData = pickle.loads(pickled)
|
||||
|
||||
assert depickled is not None
|
||||
assert depickled.vocab_type == xgr.VocabType.RAW
|
||||
assert json.loads(
|
||||
depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value
|
||||
|
||||
@ -5,11 +5,10 @@ from typing import Optional
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoModel, AutoTokenizer, BatchEncoding
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
from vllm.multimodal.audio import resample_audio
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
from ....utils import RemoteOpenAIServer
|
||||
@ -107,8 +106,6 @@ def run_test(
|
||||
**kwargs,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm."""
|
||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
@ -124,15 +121,7 @@ def run_test(
|
||||
for vllm_prompt, _, audio in prompts_and_audios
|
||||
]
|
||||
|
||||
def process(hf_inputs: BatchEncoding, **kwargs):
|
||||
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
|
||||
.to(torch_dtype) # type: ignore
|
||||
return hf_inputs
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
postprocess_inputs=process,
|
||||
auto_cls=AutoModel) as hf_model:
|
||||
with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
|
||||
hf_outputs_per_audio = [
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
[hf_prompt],
|
||||
|
||||
@ -9,7 +9,7 @@ from vllm.sampling_params import SamplingParams
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
# This test is for the hybrid models
|
||||
MODELS = ["ai21labs/Jamba-tiny-dev"]
|
||||
MODELS = ["ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct"]
|
||||
# Bamba at Fp32 is too big for the CI (L4 GPU).
|
||||
# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
|
||||
|
||||
@ -27,17 +27,19 @@ def test_models(
|
||||
) -> None:
|
||||
|
||||
# numeric error produces different generation
|
||||
if 'Bamba' in model:
|
||||
if "Bamba" in model:
|
||||
example_prompts.pop(3)
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
model_kwargs={
|
||||
"use_mamba_kernels":
|
||||
False, # mamba kernels are not installed so HF
|
||||
# don't use them
|
||||
}) as hf_model:
|
||||
model_kwargs = {
|
||||
"use_mamba_kernels": False, # mamba kernels are not installed so HF
|
||||
# don't use them
|
||||
}
|
||||
if "Zamba2" in model:
|
||||
# Zamba2 HF implementation automatically checks if mamba kernels are
|
||||
# installed
|
||||
model_kwargs = {}
|
||||
|
||||
with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with vllm_runner(model, dtype=dtype) as vllm_model:
|
||||
@ -112,26 +114,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
|
||||
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
|
||||
model: str, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
# numeric error during prefill chucking produces different generation
|
||||
# numeric error during prefill chunking produces different generation
|
||||
# compared to w/o prefill chunking for those examples, removed them for now
|
||||
if 'Jamba' in model:
|
||||
if "Jamba" in model:
|
||||
example_prompts.pop(7)
|
||||
example_prompts.pop(2)
|
||||
example_prompts.pop(1)
|
||||
elif 'Bamba' in model:
|
||||
elif "Bamba" in model:
|
||||
example_prompts.pop(6)
|
||||
example_prompts.pop(3)
|
||||
example_prompts.pop(2)
|
||||
dtype = "half" # use a different dtype for Bamba
|
||||
elif "Zamba2" in model:
|
||||
example_prompts.pop(7)
|
||||
dtype = "half"
|
||||
|
||||
with hf_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
model_kwargs={
|
||||
"use_mamba_kernels":
|
||||
False, # mamba kernels are not installed so HF
|
||||
# don't use them
|
||||
}) as hf_model:
|
||||
model_kwargs = {
|
||||
"use_mamba_kernels": False, # mamba kernels are not installed so HF
|
||||
# don't use them
|
||||
}
|
||||
if "Zamba2" in model:
|
||||
# Zamba2 HF implementation automatically checks if mamba kernels are
|
||||
# installed
|
||||
model_kwargs = {}
|
||||
|
||||
with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
|
||||
non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with vllm_runner(model,
|
||||
|
||||
@ -9,7 +9,7 @@ from pathlib import PosixPath
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
|
||||
from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
@ -17,7 +17,7 @@ from vllm.utils import identity
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
|
||||
_VideoAssets)
|
||||
from ....utils import (fork_new_process_for_each_test, large_gpu_mark,
|
||||
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
|
||||
multi_gpu_marks)
|
||||
from ...utils import check_outputs_equal
|
||||
from .vlm_utils import custom_inputs, model_utils, runners
|
||||
@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}),
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||
dtype="bfloat16",
|
||||
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
|
||||
@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
|
||||
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
|
||||
# }),
|
||||
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
|
||||
# postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
|
||||
# stop_str=["<|im_end|>"],
|
||||
# image_size_factors=[(0.10, 0.15)],
|
||||
# max_tokens=64,
|
||||
@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
|
||||
),
|
||||
"chameleon": VLMTestInfo(
|
||||
@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
# For chameleon, we only compare the sequences
|
||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||
@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
|
||||
}),
|
||||
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
|
||||
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
|
||||
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
|
||||
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
|
||||
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
|
||||
@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
use_tokenizer_eos=True,
|
||||
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
||||
num_logprobs=10,
|
||||
@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
# TODO: Use AutoModelForVision2Seq once transformers supports this
|
||||
auto_cls=AutoModelForPreTraining,
|
||||
dtype="bfloat16",
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
|
||||
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||
),
|
||||
@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
|
||||
}),
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
patch_hf_runner=model_utils.glm4v_patch_hf_runner,
|
||||
# The image embeddings match with HF but the outputs of the language
|
||||
@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
|
||||
}),
|
||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
use_tokenizer_eos=True,
|
||||
num_logprobs=10,
|
||||
patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
|
||||
@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
|
||||
),
|
||||
"intern_vl": VLMTestInfo(
|
||||
@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
|
||||
}),
|
||||
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
# NOTE: Mono-InternVL-2B doesn't work with fp16,
|
||||
# it will result NaN during inference.
|
||||
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
|
||||
dtype="bfloat16",
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
),
|
||||
@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
||||
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||
max_model_len=10240,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
|
||||
@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
|
||||
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
|
||||
num_video_frames=16,
|
||||
max_model_len=16384,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
||||
"pixel_values_videos"
|
||||
),
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
get_stop_token_ids=lambda tok: [128009],
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
|
||||
patch_hf_runner=model_utils.mantis_patch_hf_runner,
|
||||
marks=[
|
||||
@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
|
||||
postprocess_inputs=model_utils.wrap_inputs_post_processor,
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
|
||||
),
|
||||
"minicpmo_26": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-o-2_6"],
|
||||
@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
postprocess_inputs=model_utils.ignore_inputs_post_processor(
|
||||
"image_sizes"
|
||||
),
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmo_patch_hf_runner
|
||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||
),
|
||||
"minicpmv_26": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-V-2_6"],
|
||||
@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
postprocess_inputs=model_utils.ignore_inputs_post_processor(
|
||||
"image_sizes"
|
||||
),
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||
),
|
||||
"molmo": VLMTestInfo(
|
||||
models=["allenai/Molmo-7B-D-0924"],
|
||||
@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
patch_hf_runner=model_utils.molmo_patch_hf_runner,
|
||||
postprocess_inputs=model_utils.molmo_post_processor,
|
||||
),
|
||||
# Tests for phi3v currently live in another file because of a bug in
|
||||
# transformers. Once this issue is fixed, we can enable them here instead.
|
||||
@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
|
||||
img_idx_to_prompt=lambda idx: "[IMG]",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
marks=[large_gpu_mark(min_gb=48)],
|
||||
),
|
||||
"qwen_vl": VLMTestInfo(
|
||||
@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["facebook/chameleon-7b"],
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
|
||||
hf_output_post_proc = lambda hf_output, model: hf_output[:2],
|
||||
comparator=check_outputs_equal,
|
||||
@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["llava-hf/llava-1.5-7b-hf"],
|
||||
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
|
||||
max_model_len=4096,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
marks=multi_gpu_marks(num_gpus=2),
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
|
||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
|
||||
max_model_len=10240,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
|
||||
marks=multi_gpu_marks(num_gpus=2),
|
||||
**COMMON_BROADCAST_SETTINGS # type: ignore
|
||||
@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
max_model_len=16384,
|
||||
max_num_seqs=2,
|
||||
postprocess_inputs=model_utils.cast_dtype_post_processor(
|
||||
"pixel_values"
|
||||
),
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
@ -592,7 +559,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@ -617,7 +584,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@ -642,7 +609,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_image_embedding_models(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@ -666,7 +633,7 @@ def test_image_embedding_models(model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
|
||||
@ -688,7 +655,7 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
fork_new_process_for_each_test=False,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
def test_custom_inputs_models(
|
||||
model_type: str,
|
||||
@ -714,9 +681,9 @@ def test_custom_inputs_models(
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -740,9 +707,9 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -766,9 +733,9 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_image_embedding_models_heavy(model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -791,7 +758,7 @@ def test_image_embedding_models_heavy(model_type: str,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
@ -814,9 +781,9 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
|
||||
get_parametrized_options(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
fork_new_process_for_each_test=True,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_custom_inputs_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
|
||||
@ -100,7 +100,6 @@ def run_test(
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
lora_extra_vocab_size=0,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import json
|
||||
import uuid
|
||||
from dataclasses import asdict
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
||||
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams,
|
||||
TextPrompt, TokensPrompt)
|
||||
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.multimodal.inputs import PlaceholderRange
|
||||
from vllm.sequence import Logprob, SampleLogprobs
|
||||
@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
|
||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||
PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
|
||||
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
|
||||
|
||||
IMG_URLS = [
|
||||
"https://picsum.photos/id/237/400/300",
|
||||
"https://picsum.photos/id/231/200/300",
|
||||
@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
|
||||
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
||||
assert FIXTURES_PATH.exists()
|
||||
|
||||
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
|
||||
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
|
||||
FIXTURE_LOGPROBS_CHAT = {
|
||||
PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
|
||||
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
|
||||
}
|
||||
|
||||
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
|
||||
|
||||
@ -166,12 +170,12 @@ def test_chat(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT)
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
|
||||
FIXTURE_LOGPROBS_CHAT[model])
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
@ -183,70 +187,40 @@ def test_chat(
|
||||
outputs.extend(output)
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
# Remove last `None` prompt_logprobs to compare with fixture
|
||||
for i in range(len(logprobs)):
|
||||
assert logprobs[i][-1] is None
|
||||
logprobs[i] = logprobs[i][:-1]
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
|
||||
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
|
||||
args = EngineArgs(
|
||||
model=model,
|
||||
tokenizer_mode="mistral",
|
||||
enable_chunked_prefill=False,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
dtype=dtype,
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(args)
|
||||
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
|
||||
|
||||
outputs = []
|
||||
count = 0
|
||||
while True:
|
||||
out = engine.step()
|
||||
count += 1
|
||||
for request_output in out:
|
||||
if request_output.finished:
|
||||
outputs.append(request_output)
|
||||
|
||||
if count == 2:
|
||||
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
|
||||
SAMPLING_PARAMS)
|
||||
if not engine.has_unfinished_requests():
|
||||
break
|
||||
|
||||
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.parametrize(
|
||||
"prompt,expected_ranges",
|
||||
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{
|
||||
"offset": 10,
|
||||
"offset": 11,
|
||||
"length": 494
|
||||
}]),
|
||||
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{
|
||||
"offset": 10,
|
||||
"offset": 11,
|
||||
"length": 266
|
||||
}, {
|
||||
"offset": 276,
|
||||
"offset": 277,
|
||||
"length": 1056
|
||||
}, {
|
||||
"offset": 1332,
|
||||
"offset": 1333,
|
||||
"length": 418
|
||||
}])])
|
||||
def test_multi_modal_placeholders(
|
||||
vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None:
|
||||
def test_multi_modal_placeholders(vllm_runner, prompt,
|
||||
expected_ranges: list[PlaceholderRange],
|
||||
monkeypatch) -> None:
|
||||
|
||||
# This placeholder checking test only works with V0 engine
|
||||
# where `multi_modal_placeholders` is returned with `RequestOutput`
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
with vllm_runner(
|
||||
"mistral-community/pixtral-12b",
|
||||
max_model_len=8192,
|
||||
|
||||
@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||
|
||||
|
||||
def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
fork_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
def get_filtered_test_settings(
|
||||
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
|
||||
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests who have the current test type enabled with the matching val for
|
||||
fork_per_test.
|
||||
@ -43,7 +43,7 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
||||
|
||||
# Everything looks okay; keep if this is has correct proc handling
|
||||
if (test_info.distributed_executor_backend
|
||||
is not None) == fork_per_test:
|
||||
is not None) == new_proc_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
@ -51,14 +51,14 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
|
||||
|
||||
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
fork_new_process_for_each_test: bool):
|
||||
create_new_process_for_each_test: bool):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, fork_new_process_for_each_test)
|
||||
test_settings, test_type, create_new_process_for_each_test)
|
||||
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||
|
||||
@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import BatchEncoding
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import TaskOption
|
||||
@ -31,7 +30,6 @@ def run_test(
|
||||
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
|
||||
auto_cls: type[_BaseAutoModelClass],
|
||||
use_tokenizer_eos: bool,
|
||||
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
|
||||
comparator: Callable[..., None],
|
||||
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
|
||||
stop_str: Optional[list[str]],
|
||||
@ -101,7 +99,6 @@ def run_test(
|
||||
hf_model = hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=auto_cls,
|
||||
postprocess_inputs=postprocess_inputs,
|
||||
model_kwargs=hf_model_kwargs)
|
||||
|
||||
# Some models need to patch things like the model processor, e.g., internvl
|
||||
|
||||
@ -6,16 +6,15 @@ typically specific to a small subset of models.
|
||||
import re
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
from typing import Callable, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchEncoding,
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig)
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import patch_padding_side
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, _ImageAssets
|
||||
from .types import RunnerOutput
|
||||
@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
|
||||
return [asset.image_embeds for asset in image_assets]
|
||||
|
||||
|
||||
####### postprocessors to run on HF BatchEncoding
|
||||
def cast_dtype_post_processor(
|
||||
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
|
||||
"""Gets a handle to a post processor which converts a given key into a
|
||||
target data type."""
|
||||
|
||||
def process(hf_inputs: BatchEncoding, dtype: str):
|
||||
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
|
||||
hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
|
||||
return hf_inputs
|
||||
|
||||
return process
|
||||
|
||||
|
||||
def ignore_inputs_post_processor(
|
||||
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
|
||||
"""Gets a handle to a post processor which ignores a given key."""
|
||||
|
||||
def process(hf_inputs: BatchEncoding, dtype: str):
|
||||
del hf_inputs[hf_inp_key]
|
||||
return hf_inputs
|
||||
|
||||
return process
|
||||
|
||||
|
||||
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
|
||||
return {"model_inputs": hf_inputs}
|
||||
|
||||
|
||||
def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
|
||||
hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
|
||||
return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
|
||||
|
||||
|
||||
####### Prompt path encoders for models that need models on disk
|
||||
def qwen_prompt_path_encoder(
|
||||
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
|
||||
@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
for k in inputs.keys() # noqa
|
||||
if k not in ("seq_lens", "sft_format")
|
||||
}
|
||||
inputs = BatchEncoding(data=inputs, tensor_type="pt")
|
||||
return inputs
|
||||
return BatchFeature(data=inputs, tensor_type="pt")
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return hf_model
|
||||
|
||||
|
||||
def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(self, *args, **kwargs):
|
||||
def _generate(
|
||||
self,
|
||||
*args,
|
||||
input_ids=None,
|
||||
pixel_values=None,
|
||||
image_sizes=None,
|
||||
image_bound=None,
|
||||
tgt_sizes=None,
|
||||
**kwargs,
|
||||
):
|
||||
model_inputs = {
|
||||
"input_ids": input_ids,
|
||||
"pixel_values": pixel_values,
|
||||
"image_sizes": image_sizes,
|
||||
"image_bound": image_bound,
|
||||
"tgt_sizes": tgt_sizes,
|
||||
}
|
||||
for k in list(model_inputs.keys()):
|
||||
if model_inputs[k] is None:
|
||||
model_inputs.pop(k)
|
||||
|
||||
return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(self, *args, image_sizes=None, **kwargs):
|
||||
return orig_generate(*args, decode_text=False, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(self, *args, image_sizes=None, **kwargs):
|
||||
return orig_generate(*args, decode_text=False, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
|
||||
batch = {
|
||||
k: kwargs.pop(k)
|
||||
k: kwargs.pop(k).unsqueeze(0)
|
||||
for k in ("input_ids", "images", "image_input_idx", "image_masks")
|
||||
if k in kwargs
|
||||
}
|
||||
batch = BatchFeature(batch).to(dtype=self.dtype)
|
||||
|
||||
return self.generate_from_batch(
|
||||
batch,
|
||||
|
||||
@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoModelForCausalLM, BatchEncoding
|
||||
from transformers import AutoModelForCausalLM
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.utils import identity
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
|
||||
from ....utils import check_logprobs_close
|
||||
@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
|
||||
# Indicates we should explicitly pass the EOS from the tokenizer
|
||||
use_tokenizer_eos: bool = False
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
|
||||
# Callable to pass to the HF runner to run on inputs; for now, we also pass
|
||||
# the data type to input post processing, because almost all of the uses of
|
||||
# postprocess_inputs are to fix the data types of BatchEncoding values.
|
||||
postprocess_inputs: Callable[[BatchEncoding, str],
|
||||
BatchEncoding] = identity
|
||||
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
|
||||
|
||||
# Post processors that if defined, will run oun the outputs of the
|
||||
@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
|
||||
# is all combinations of .models + all fields below
|
||||
max_tokens: Union[int, tuple[int]] = 128
|
||||
num_logprobs: Union[int, tuple[int]] = 5
|
||||
dtype: Union[str, Iterable[str]] = "half"
|
||||
dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
|
||||
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
|
||||
# Only expanded in video tests
|
||||
num_video_frames: Union[int, tuple[int]] = 16
|
||||
@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
|
||||
"vllm_output_post_proc": self.vllm_output_post_proc,
|
||||
"auto_cls": self.auto_cls,
|
||||
"use_tokenizer_eos": self.use_tokenizer_eos,
|
||||
"postprocess_inputs": self.postprocess_inputs,
|
||||
"comparator": self.comparator,
|
||||
"get_stop_token_ids": self.get_stop_token_ids,
|
||||
"hf_model_kwargs": self.hf_model_kwargs,
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from functools import partial
|
||||
from typing import Callable
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from PIL import Image
|
||||
from transformers import BatchEncoding, Qwen2VLForConditionalGeneration
|
||||
from transformers import Qwen2VLForConditionalGeneration
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
|
||||
return prompt
|
||||
|
||||
|
||||
def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
|
||||
return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
|
||||
|
||||
|
||||
def _run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
@ -118,14 +114,8 @@ def _run_test(
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
|
||||
hf_model.postprocess_inputs = partial(
|
||||
postprocess_inputs,
|
||||
hf_model,
|
||||
cache_position=torch.arange(
|
||||
0,
|
||||
1, # 1 for batch size
|
||||
requires_grad=False),
|
||||
use_cache=False)
|
||||
|
||||
prompts = []
|
||||
for text, image, embed_text in zip(input_texts, input_images,
|
||||
embed_texts):
|
||||
# dse requires non-standard input processing
|
||||
@ -133,20 +123,34 @@ def _run_test(
|
||||
messages = get_messages(image, text, embed_text)
|
||||
prompt = apply_chat_template_and_add_eos(
|
||||
messages, hf_model.processor.apply_chat_template)
|
||||
inputs = hf_model.get_inputs(
|
||||
prompts=[[prompt]],
|
||||
images=[[image]],
|
||||
)
|
||||
with torch.no_grad():
|
||||
|
||||
prompts.append(prompt)
|
||||
|
||||
all_inputs = hf_model.get_inputs(
|
||||
prompts=prompts,
|
||||
images=input_images,
|
||||
)
|
||||
|
||||
with torch.no_grad():
|
||||
all_outputs = []
|
||||
for inputs in all_inputs:
|
||||
inputs = hf_model.model.prepare_inputs_for_generation(
|
||||
**inputs,
|
||||
cache_position=torch.arange(1), # 1 for batch size
|
||||
use_cache=False,
|
||||
)
|
||||
outputs = hf_model.model(
|
||||
**hf_model.wrap_device(inputs[0],
|
||||
device=hf_model.model.device.type),
|
||||
**hf_model.wrap_device(inputs),
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
pooled_output = torch.nn.functional.normalize(
|
||||
outputs.hidden_states[-1][0, -1], p=2, dim=-1)
|
||||
hf_outputs.append(pooled_output.tolist())
|
||||
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
|
||||
p=2,
|
||||
dim=-1)
|
||||
|
||||
all_outputs.append(pooled_output.tolist())
|
||||
|
||||
hf_outputs = all_outputs
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
import pytest
|
||||
import torch.nn.functional as F
|
||||
from transformers import AutoModelForVision2Seq
|
||||
from transformers import AutoModelForImageTextToText
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -70,7 +70,7 @@ def _run_test(
|
||||
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
||||
auto_cls=AutoModelForImageTextToText) as hf_model:
|
||||
# Patch the issue where generation_config.json is missing
|
||||
hf_model.processor.patch_size = \
|
||||
hf_model.model.config.vision_config.patch_size
|
||||
@ -86,8 +86,7 @@ def _run_test(
|
||||
for inputs in all_inputs:
|
||||
# Based on: https://huggingface.co/royokong/e5-v
|
||||
outputs = hf_model.model(
|
||||
**hf_model.wrap_device(inputs,
|
||||
device=hf_model.model.device.type),
|
||||
**hf_model.wrap_device(inputs),
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
|
||||
@ -53,8 +53,7 @@ def _run_test(
|
||||
for inputs in all_inputs:
|
||||
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
|
||||
outputs = hf_model.model(
|
||||
**hf_model.wrap_device(inputs,
|
||||
device=hf_model.model.device.type),
|
||||
**hf_model.wrap_device(inputs),
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
|
||||
@ -10,7 +10,7 @@ import pytest
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ....utils import fork_new_process_for_each_test, multi_gpu_test
|
||||
from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
PROMPTS = [
|
||||
{
|
||||
@ -119,7 +119,7 @@ def run_test(
|
||||
assert output.outputs[0].text == expected
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize(
|
||||
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
|
||||
|
||||
@ -4,8 +4,7 @@ from typing import Optional, overload
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
||||
BatchEncoding)
|
||||
from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
|
||||
@ -216,7 +215,6 @@ def _run_test(
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
||||
}) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
@ -227,14 +225,10 @@ def _run_test(
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
def process(hf_inputs: BatchEncoding, **kwargs):
|
||||
return hf_inputs
|
||||
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
model_kwargs={"device_map": "auto"},
|
||||
postprocess_inputs=process,
|
||||
auto_cls=AutoModelForVision2Seq) as hf_model:
|
||||
auto_cls=AutoModelForImageTextToText) as hf_model:
|
||||
hf_outputs_per_image = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
@ -430,7 +424,6 @@ def test_bnb_regression(
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
enforce_eager=True,
|
||||
quantization="bitsandbytes",
|
||||
load_format="bitsandbytes",
|
||||
)
|
||||
@ -486,7 +479,6 @@ def test_explicit_implicit_prompt(
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=True,
|
||||
)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
@ -518,7 +510,6 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image":
|
||||
_LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
|
||||
|
||||
|
||||
1
tests/models/fixtures/mistral_small_3_chat.json
Normal file
1
tests/models/fixtures/mistral_small_3_chat.json
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -195,6 +195,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
|
||||
is_available_online=False,
|
||||
trust_remote_code=True),
|
||||
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct",
|
||||
min_transformers_version="4.49"),
|
||||
# [Encoder-decoder]
|
||||
"BartModel": _HfExamplesInfo("facebook/bart-base"),
|
||||
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
|
||||
|
||||
@ -5,10 +5,10 @@ import pytest
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_plugin(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_opt_path: str,
|
||||
@ -24,7 +24,7 @@ def test_plugin(
|
||||
assert (error_msg in str(excinfo.value))
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_text_generation(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_opt_path: str,
|
||||
@ -44,7 +44,7 @@ def test_oot_registration_text_generation(
|
||||
assert rest == ""
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_embedding(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_gemma2_embedding_path: str,
|
||||
@ -62,7 +62,7 @@ def test_oot_registration_embedding(
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_oot_registration_multimodal(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
dummy_llava_path: str,
|
||||
|
||||
@ -17,7 +17,7 @@ from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
|
||||
ModelRegistry)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
from ..utils import create_new_process_for_each_test
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ def test_registry_imports(model_arch):
|
||||
assert supports_multimodal(model_cls)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
|
||||
("LlamaForCausalLM", False, False, False),
|
||||
("MllamaForConditionalGeneration", True, False, False),
|
||||
@ -70,7 +70,7 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
|
||||
stacklevel=2)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
|
||||
("MLPSpeculatorPreTrainedModel", False, False),
|
||||
("DeepseekV2ForCausalLM", True, False),
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
import warnings
|
||||
from collections.abc import Sequence
|
||||
from typing import Optional, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
@ -254,9 +254,9 @@ def check_logprobs_close(
|
||||
def build_model_context(
|
||||
model_id: str,
|
||||
task: TaskOption = "auto",
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
mm_processor_kwargs: Optional[dict] = None,
|
||||
limit_mm_per_prompt: Optional[dict] = None,
|
||||
dtype: Union[str, torch.dtype] = "auto",
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
limit_mm_per_prompt: Optional[dict[str, int]] = None,
|
||||
disable_mm_preprocessor_cache: bool = True,
|
||||
):
|
||||
"""Creates an InputContext for a given model.
|
||||
@ -274,9 +274,6 @@ def build_model_context(
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
if dtype is None:
|
||||
dtype = "half"
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task=task,
|
||||
|
||||
@ -7,19 +7,25 @@ from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import ProcessorMixin
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
|
||||
MultiModalKwargsItem,
|
||||
MultiModalSharedField)
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
|
||||
PromptIndexTargets, PromptInsertion,
|
||||
PromptReplacement, apply_text_matches,
|
||||
ProcessingCache, PromptIndexTargets,
|
||||
PromptInsertion, PromptReplacement,
|
||||
apply_text_matches,
|
||||
apply_token_matches,
|
||||
find_mm_placeholders,
|
||||
find_text_matches, find_token_matches,
|
||||
iter_token_matches)
|
||||
iter_token_matches,
|
||||
replace_token_matches)
|
||||
# yapf: enable
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||
@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
|
||||
assert all(match_len == len(match_ids) for match_len in match_lens)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("token_ids", "match_ids", "new_ids", "expected"),
|
||||
[
|
||||
([], [], [-1], []),
|
||||
([], [32000], [-1], []),
|
||||
(
|
||||
[32000, 32000, 32000],
|
||||
[32000],
|
||||
[-1],
|
||||
[-1, -1, -1],
|
||||
),
|
||||
(
|
||||
[32000, 32000, 32000],
|
||||
[32000, 32000],
|
||||
[-1],
|
||||
[-1, 32000],
|
||||
),
|
||||
(
|
||||
[32000, 32000, 32000],
|
||||
[32000, 32000, 32000],
|
||||
[-1],
|
||||
[-1],
|
||||
),
|
||||
(
|
||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
[28747, 32000],
|
||||
[-1],
|
||||
[9833, -1, 32000, 32000, 9833, -1, 32000, 918],
|
||||
),
|
||||
(
|
||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
[28747, 32000, 32000, 32000],
|
||||
[-1],
|
||||
[9833, -1, 9833, 28747, 32000, 32000, 918],
|
||||
),
|
||||
(
|
||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
[28747, 0, 32000],
|
||||
[-1],
|
||||
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
|
||||
),
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
|
||||
result = replace_token_matches(token_ids, match_ids, new_ids)
|
||||
|
||||
# Manually constructed results
|
||||
assert result == expected
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("prompt", "target_by_key", "expected_by_key"),
|
||||
@ -837,6 +895,45 @@ def test_find_mm_placeholders(
|
||||
assert result == expected
|
||||
|
||||
|
||||
def _dummy_elem(modality: str, key: str, size: int):
|
||||
return MultiModalFieldElem(
|
||||
modality=modality,
|
||||
key=key,
|
||||
data=torch.empty((size, ), dtype=torch.int8),
|
||||
field=MultiModalSharedField(1),
|
||||
)
|
||||
|
||||
|
||||
def _dummy_item(modality: str, size_by_key: dict[str, int]):
|
||||
return MultiModalKwargsItem.from_elems([
|
||||
_dummy_elem(modality, key, size) for key, size in size_by_key.items()
|
||||
])
|
||||
|
||||
|
||||
def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
|
||||
return MultiModalKwargs.from_items([
|
||||
_dummy_item(modality, size_by_key)
|
||||
for modality, size_by_key in size_by_key_modality.items()
|
||||
])
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("item", "expected_size"),
|
||||
[
|
||||
(_dummy_item("a", {"a1": 100}), 100),
|
||||
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
|
||||
(_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
def test_cache_item_size(item, expected_size):
|
||||
cache = ProcessingCache.get_lru_cache(2048, type(item))
|
||||
cache[""] = item
|
||||
|
||||
assert cache.currsize == expected_size
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
@pytest.mark.parametrize(
|
||||
("limit", "num_supported", "is_valid"),
|
||||
@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="half",
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="half",
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="half",
|
||||
dtype="auto",
|
||||
revision=None,
|
||||
)
|
||||
|
||||
|
||||
@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
|
||||
|
||||
# Test edge cases
|
||||
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
|
||||
(16, 4, 8, 8192, 48, 1, 128, True), # large prefill batch
|
||||
(16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch
|
||||
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
|
||||
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
|
||||
])
|
||||
|
||||
@ -6,7 +6,7 @@ from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||
|
||||
|
||||
|
||||
@ -10,10 +10,13 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from tests.utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
models_4bit_to_test = [
|
||||
("facebook/opt-125m", "quantize opt model inflight"),
|
||||
("mistralai/Mistral-7B-Instruct-v0.3",
|
||||
"quantize inflight model with both HF and Mistral format weights")
|
||||
]
|
||||
|
||||
models_pre_qaunt_4bit_to_test = [
|
||||
@ -32,7 +35,7 @@ models_pre_quant_8bit_to_test = [
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -45,7 +48,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_qaunt_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -57,7 +60,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description",
|
||||
models_pre_quant_8bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -70,7 +73,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
@ -88,7 +91,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||
common_args = [
|
||||
"--disable-log-stats",
|
||||
|
||||
@ -42,7 +42,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
from vllm import SamplingParams
|
||||
|
||||
from ...utils import fork_new_process_for_each_test
|
||||
from ...utils import create_new_process_for_each_test
|
||||
from .conftest import (get_output_from_llm_generator,
|
||||
run_equality_correctness_test)
|
||||
|
||||
@ -82,7 +82,7 @@ from .conftest import (get_output_from_llm_generator,
|
||||
@pytest.mark.parametrize("test_llm_kwargs", [{}])
|
||||
@pytest.mark.parametrize("batch_size", [1, 32])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||
batch_size: int):
|
||||
"""Run generation with speculative decoding on a batch. Verify the engine
|
||||
@ -170,7 +170,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [1])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -244,7 +244,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [64])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -300,7 +300,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [32])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||
@ -356,7 +356,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
|
||||
256,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -411,7 +411,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
|
||||
64,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -469,7 +469,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [4])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -534,7 +534,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
|
||||
32,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs,
|
||||
@ -594,7 +594,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
|
||||
64,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||
test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -644,7 +644,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize("output_len", [10])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs, baseline_llm_kwargs,
|
||||
test_llm_kwargs, batch_size: int, output_len: int,
|
||||
@ -697,7 +697,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
|
||||
32,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
|
||||
output_len: int, seed: int):
|
||||
@ -752,7 +752,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
|
||||
32,
|
||||
])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
|
||||
per_test_common_llm_kwargs,
|
||||
baseline_llm_kwargs, test_llm_kwargs,
|
||||
|
||||
@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
|
||||
test_prompts = multilora_inference.create_test_prompts(lora_path)
|
||||
|
||||
# Serialize model before deserializing and binding LoRA adapters
|
||||
with vllm_runner(model_ref, ) as vllm_model:
|
||||
with vllm_runner(model_ref) as vllm_model:
|
||||
model_path = tmp_path / (model_ref + ".tensors")
|
||||
|
||||
vllm_model.apply_model(
|
||||
@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
|
||||
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
|
||||
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
|
||||
## Serialize model
|
||||
with vllm_runner(model_ref, ) as vllm_model:
|
||||
with vllm_runner(model_ref) as vllm_model:
|
||||
model_path = tmp_path / (model_ref + ".tensors")
|
||||
|
||||
vllm_model.apply_model(
|
||||
|
||||
@ -16,7 +16,7 @@ from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
|
||||
deprecate_kwargs, get_open_port, memory_profiling,
|
||||
merge_async_iterators, supports_kw, swap_dict_values)
|
||||
|
||||
from .utils import error_on_warning, fork_new_process_for_each_test
|
||||
from .utils import create_new_process_for_each_test, error_on_warning
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -276,7 +276,7 @@ def test_supports_kw(callable,kw_name,requires_kw_only,
|
||||
) == is_supported
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_memory_profiling():
|
||||
# Fake out some model loading + inference memory usage to test profiling
|
||||
# Memory used by other processes will show up as cuda usage outside of torch
|
||||
|
||||
@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
|
||||
|
||||
# disable custom dispatcher, let Dynamo takes over
|
||||
# all the control
|
||||
llm = LLM(model="google/gemma-2b",
|
||||
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=512,
|
||||
max_num_seqs=64,
|
||||
enforce_eager=True,
|
||||
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
|
||||
compiled_code = sorted(
|
||||
compiled_codes = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
|
||||
|
||||
# we should only trigger Dynamo compilation three times:
|
||||
# one for the profiling phase without kv cache
|
||||
# one for the prefill phase with symbolic shapes
|
||||
# one for the decode phase with symbolic shapes
|
||||
for i, compiled_code in enumerate(compiled_codes):
|
||||
print("{} file: {}".format(i + 1, compiled_code))
|
||||
|
||||
# We should only trigger Dynamo compilation 4 times:
|
||||
# 1. forward pass (symbolic)
|
||||
# 2. compute_logits (symbolic)
|
||||
# 3. forward pass (shape 16)
|
||||
# 4. forward pass (shape 32)
|
||||
# and later calls should not trigger Dynamo compilation again.
|
||||
# NOTE: it might still trigger XLA compilation.
|
||||
# NOTE: It might still trigger XLA compilation.
|
||||
|
||||
# check we have three compiled code
|
||||
# this is the assumption when we use the custom dispatcher
|
||||
assert len(compiled_code) == 3
|
||||
# Check we have 4 compiled codes
|
||||
assert len(compiled_codes) == 4
|
||||
|
||||
# check all the compilations are as expected
|
||||
compiled_fn = sorted(
|
||||
kv_cache_prefix = "kv_cache"
|
||||
attn_prefix = "ragged_paged_attention"
|
||||
|
||||
# Check all the compilations are as expected
|
||||
compiled_fns = sorted(
|
||||
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
|
||||
|
||||
# the first compilation is the profiling phase,
|
||||
# it should not have any kv cache
|
||||
with open(compiled_fn[0]) as f:
|
||||
content = f.read()
|
||||
assert "kv_caches" not in content
|
||||
for i, compiled_fn in enumerate(compiled_fns):
|
||||
print("{} file: {}".format(i + 1, compiled_fn))
|
||||
|
||||
# the second compilation is the prefill phase,
|
||||
# it should have kv cache and the flash_attention op
|
||||
with open(compiled_fn[1]) as f:
|
||||
# The first compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[0]) as f:
|
||||
content = f.read()
|
||||
assert "kv_caches" in content and "torch.ops.xla.flash_attention" in content
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# the third compilation is the decode phase,
|
||||
# it should have kv cache and the paged_attention op
|
||||
with open(compiled_fn[2]) as f:
|
||||
# The second compilation is symbolic, so it should not have any kv_caches
|
||||
with open(compiled_fns[1]) as f:
|
||||
content = f.read()
|
||||
assert "kv_caches" in content and "torch.ops.xla.paged_attention" in content
|
||||
assert kv_cache_prefix not in content
|
||||
|
||||
# The third compilation is shape 16, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[2]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
# The forth compilation is shape 32, so it should have kv_caches and the
|
||||
# ragged_paged_attention
|
||||
with open(compiled_fns[3]) as f:
|
||||
content = f.read()
|
||||
assert (kv_cache_prefix in content and attn_prefix in content)
|
||||
|
||||
@ -14,12 +14,17 @@ from ..utils import compare_two_settings
|
||||
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_RPC_TIMEOUT", "30000")
|
||||
compare_two_settings(
|
||||
"google/gemma-2b",
|
||||
arg1=[
|
||||
"--enforce-eager",
|
||||
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
||||
],
|
||||
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
|
||||
env1={},
|
||||
env2={})
|
||||
compare_two_settings("Qwen/Qwen2.5-1.5B-Instruct",
|
||||
arg1=[
|
||||
"--max-model-len=256",
|
||||
"--max-num-seqs=32",
|
||||
"--enforce-eager",
|
||||
f"-O{CompilationLevel.DYNAMO_ONCE}",
|
||||
],
|
||||
arg2=[
|
||||
"--max-model-len=256", "--max-num-seqs=32",
|
||||
"--enforce-eager",
|
||||
f"-O{CompilationLevel.DYNAMO_AS_IS}"
|
||||
],
|
||||
env1={},
|
||||
env2={})
|
||||
|
||||
@ -7,12 +7,14 @@ import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from contextlib import contextmanager, suppress
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional, Union
|
||||
from typing import Any, Callable, Literal, Optional, Union
|
||||
|
||||
import cloudpickle
|
||||
import openai
|
||||
import pytest
|
||||
import requests
|
||||
@ -703,6 +705,78 @@ def fork_new_process_for_each_test(
|
||||
return wrapper
|
||||
|
||||
|
||||
def spawn_new_process_for_each_test(
|
||||
f: Callable[_P, None]) -> Callable[_P, None]:
|
||||
"""Decorator to spawn a new process for each test function.
|
||||
"""
|
||||
|
||||
@functools.wraps(f)
|
||||
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
|
||||
# Check if we're already in a subprocess
|
||||
if os.environ.get('RUNNING_IN_SUBPROCESS') == '1':
|
||||
# If we are, just run the function directly
|
||||
return f(*args, **kwargs)
|
||||
|
||||
import torch.multiprocessing as mp
|
||||
with suppress(RuntimeError):
|
||||
mp.set_start_method('spawn')
|
||||
|
||||
# Get the module
|
||||
module_name = f.__module__
|
||||
|
||||
# Create a process with environment variable set
|
||||
env = os.environ.copy()
|
||||
env['RUNNING_IN_SUBPROCESS'] = '1'
|
||||
|
||||
with tempfile.TemporaryDirectory() as tempdir:
|
||||
output_filepath = os.path.join(tempdir, "new_process.tmp")
|
||||
|
||||
# `cloudpickle` allows pickling complex functions directly
|
||||
input_bytes = cloudpickle.dumps((f, output_filepath))
|
||||
|
||||
cmd = [sys.executable, "-m", f"{module_name}"]
|
||||
|
||||
returned = subprocess.run(cmd,
|
||||
input=input_bytes,
|
||||
capture_output=True,
|
||||
env=env)
|
||||
|
||||
# check if the subprocess is successful
|
||||
try:
|
||||
returned.check_returncode()
|
||||
except Exception as e:
|
||||
# wrap raised exception to provide more information
|
||||
raise RuntimeError(f"Error raised in subprocess:\n"
|
||||
f"{returned.stderr.decode()}") from e
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def create_new_process_for_each_test(
|
||||
method: Optional[Literal["spawn", "fork"]] = None
|
||||
) -> Callable[[Callable[_P, None]], Callable[_P, None]]:
|
||||
"""Creates a decorator that runs each test function in a new process.
|
||||
|
||||
Args:
|
||||
method: The process creation method. Can be either "spawn" or "fork".
|
||||
If not specified,
|
||||
it defaults to "spawn" on ROCm platforms and "fork" otherwise.
|
||||
|
||||
Returns:
|
||||
A decorator to run test functions in separate processes.
|
||||
"""
|
||||
if method is None:
|
||||
method = "spawn" if current_platform.is_rocm() else "fork"
|
||||
|
||||
assert method in ["spawn",
|
||||
"fork"], "Method must be either 'spawn' or 'fork'"
|
||||
|
||||
if method == "fork":
|
||||
return fork_new_process_for_each_test
|
||||
|
||||
return spawn_new_process_for_each_test
|
||||
|
||||
|
||||
def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
|
||||
"""
|
||||
Get a pytest mark, which skips the test if the GPU doesn't meet
|
||||
@ -762,7 +836,7 @@ def multi_gpu_test(*, num_gpus: int):
|
||||
marks = multi_gpu_marks(num_gpus=num_gpus)
|
||||
|
||||
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
|
||||
func = fork_new_process_for_each_test(f)
|
||||
func = create_new_process_for_each_test()(f)
|
||||
for mark in reversed(marks):
|
||||
func = mark(func)
|
||||
|
||||
|
||||
@ -6,7 +6,8 @@ import pytest
|
||||
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.scheduler import Scheduler, SchedulerOutput
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
@ -76,21 +76,18 @@ async def generate(engine: AsyncLLM,
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
||||
@pytest.mark.parametrize("engine_args_and_prompt",
|
||||
@pytest.mark.parametrize("engine_args,prompt",
|
||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||
@pytest.mark.asyncio
|
||||
async def test_load(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
output_kind: RequestOutputKind,
|
||||
engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
|
||||
):
|
||||
async def test_load(monkeypatch: pytest.MonkeyPatch,
|
||||
output_kind: RequestOutputKind,
|
||||
engine_args: AsyncEngineArgs, prompt: PromptType):
|
||||
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
|
||||
# so that in the future when we switch, we don't have to change all the
|
||||
# tests.
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
engine_args, prompt = engine_args_and_prompt
|
||||
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
@ -124,18 +121,16 @@ async def test_load(
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
|
||||
@pytest.mark.parametrize("engine_args_and_prompt",
|
||||
@pytest.mark.parametrize("engine_args,prompt",
|
||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||
@pytest.mark.asyncio
|
||||
async def test_abort(monkeypatch: pytest.MonkeyPatch,
|
||||
output_kind: RequestOutputKind,
|
||||
engine_args_and_prompt: tuple[AsyncEngineArgs,
|
||||
PromptType]):
|
||||
engine_args: AsyncEngineArgs, prompt: PromptType):
|
||||
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
engine_args, prompt = engine_args_and_prompt
|
||||
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
@ -193,17 +188,15 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch,
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n", [1, 3])
|
||||
@pytest.mark.parametrize("engine_args_and_prompt",
|
||||
@pytest.mark.parametrize("engine_args,prompt",
|
||||
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
|
||||
(VISION_ENGINE_ARGS, VISION_PROMPT)])
|
||||
@pytest.mark.asyncio
|
||||
async def test_finished_flag(monkeypatch, n: int,
|
||||
engine_args_and_prompt: tuple[AsyncEngineArgs,
|
||||
PromptType]):
|
||||
async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int,
|
||||
engine_args: AsyncEngineArgs, prompt: PromptType):
|
||||
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
engine_args, prompt = engine_args_and_prompt
|
||||
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
|
||||
@ -9,7 +9,6 @@ from concurrent.futures import Future
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
@ -19,6 +18,8 @@ from vllm.v1.executor.abstract import Executor, UniProcExecutor
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||
allow_module_level=True)
|
||||
@ -44,7 +45,7 @@ def make_request() -> EngineCoreRequest:
|
||||
)
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
@ -157,8 +158,24 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
# Sending duplicate requests with same request_id
|
||||
req0 = make_request()
|
||||
req1 = make_request()
|
||||
req0.request_id = req1.request_id = "test"
|
||||
engine_core.add_request(req0)
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
while len(engine_core.step().outputs) > 0:
|
||||
pass
|
||||
|
||||
engine_core.add_request(req1)
|
||||
while len(engine_core.step().outputs) > 0:
|
||||
pass
|
||||
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
A basic end-to-end test to verify that the engine functions correctly
|
||||
@ -208,7 +225,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||
_check_engine_state()
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Test that the engine can handle multiple concurrent batches.
|
||||
|
||||
@ -8,7 +8,6 @@ from typing import Optional
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.utils import fork_new_process_for_each_test
|
||||
from vllm import SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
@ -19,6 +18,8 @@ from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
|
||||
SyncMPClient)
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
|
||||
from ...utils import create_new_process_for_each_test
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip(reason="V1 currently only supported on CUDA.",
|
||||
allow_module_level=True)
|
||||
@ -88,7 +89,7 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
|
||||
return msg
|
||||
|
||||
|
||||
@fork_new_process_for_each_test
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
|
||||
def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
|
||||
multiprocessing_mode: bool):
|
||||
|
||||
@ -50,7 +50,7 @@ def _get_test_sampling_params(
|
||||
"""Generate random sampling params for a batch."""
|
||||
|
||||
def get_mostly_n_gt1() -> int:
|
||||
"""Mostly n \in [2,20], ~1/3 n=1"""
|
||||
r"""Mostly n \in [2,20], ~1/3 n=1"""
|
||||
x = random.randint(0, 28)
|
||||
if x < 10:
|
||||
return 1
|
||||
|
||||
@ -18,9 +18,6 @@ MODELS_TO_TEST = [
|
||||
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
|
||||
]
|
||||
|
||||
# Undo after https://github.com/vllm-project/vllm/pull/14868
|
||||
pytest.skip(allow_module_level=True)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
|
||||
@ -6,20 +6,23 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
|
||||
from vllm.v1.sample.rejection_sampler import (PLACEHOLDER_TOKEN_ID,
|
||||
RejectionSampler)
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
|
||||
DEVICE = "cpu"
|
||||
DEVICE = "cuda"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sampler():
|
||||
def rejection_sampler():
|
||||
return RejectionSampler()
|
||||
|
||||
|
||||
def create_logits_tensor(token_ids: list[list[int]],
|
||||
def create_logits_tensor(output_token_ids: list[list[int]],
|
||||
vocab_size: int = 100) -> torch.Tensor:
|
||||
"""Helper function to create logits tensor that
|
||||
will produce desired token ids on argmax"""
|
||||
token_ids = [tokens[:-1] for tokens in output_token_ids]
|
||||
num_total_tokens = sum(len(tokens) for tokens in token_ids)
|
||||
logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
|
||||
start_loc = 0
|
||||
@ -31,15 +34,22 @@ def create_logits_tensor(token_ids: list[list[int]],
|
||||
|
||||
|
||||
def create_sampling_metadata(
|
||||
all_greedy: bool,
|
||||
generators: Optional[dict[int, Any]] = None) -> SamplingMetadata:
|
||||
all_greedy: bool,
|
||||
temperature: Optional[torch.Tensor] = None,
|
||||
generators: Optional[dict[int, Any]] = None,
|
||||
) -> SamplingMetadata:
|
||||
"""Create a v1 sampling metadata object with all_greedy set
|
||||
to the given value. Either all greedy or all random sampling
|
||||
is used.
|
||||
"""
|
||||
generators = generators or {}
|
||||
if all_greedy:
|
||||
temperature = None
|
||||
else:
|
||||
assert temperature is not None
|
||||
|
||||
return SamplingMetadata(
|
||||
temperature=torch.tensor([]),
|
||||
temperature=temperature,
|
||||
all_greedy=all_greedy,
|
||||
all_random=not all_greedy,
|
||||
top_p=None,
|
||||
@ -61,7 +71,7 @@ def create_sampling_metadata(
|
||||
|
||||
|
||||
########################### Tests for Greedy Sampling ###################
|
||||
def test_perfect_match(sampler):
|
||||
def test_perfect_match(rejection_sampler):
|
||||
"""Test when output tokens perfectly match speculated tokens"""
|
||||
spec_tokens = [[1, 2, 3]]
|
||||
output_tokens = [[1, 2, 3, 4]] # 4 is the bonus token
|
||||
@ -70,15 +80,23 @@ def test_perfect_match(sampler):
|
||||
logits = create_logits_tensor(output_tokens)
|
||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||
device=logits.device)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||
device=logits.device)
|
||||
|
||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
||||
output = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=None,
|
||||
target_logits=logits,
|
||||
bonus_token_ids=bonus_token_tensor,
|
||||
sampling_metadata=metadata,
|
||||
)
|
||||
expected = torch.tensor([[1, 2, 3, 4]],
|
||||
dtype=torch.int,
|
||||
device=logits.device)
|
||||
assert torch.equal(output, expected)
|
||||
|
||||
|
||||
def test_early_mismatch(sampler):
|
||||
def test_early_mismatch(rejection_sampler):
|
||||
"""Test when there's an early mismatch in tokens"""
|
||||
spec_tokens = [[1, 2, 3]]
|
||||
output_tokens = [[1, 5, 3, 4]] # Mismatch at position 1
|
||||
@ -87,15 +105,25 @@ def test_early_mismatch(sampler):
|
||||
logits = create_logits_tensor(output_tokens)
|
||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||
device=logits.device)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||
device=logits.device)
|
||||
|
||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
||||
expected = torch.tensor([[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
|
||||
dtype=torch.int,
|
||||
device=logits.device)
|
||||
output = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=None,
|
||||
target_logits=logits,
|
||||
bonus_token_ids=bonus_token_tensor,
|
||||
sampling_metadata=metadata,
|
||||
)
|
||||
expected = torch.tensor(
|
||||
[[1, 5, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]],
|
||||
dtype=torch.int,
|
||||
device=logits.device,
|
||||
)
|
||||
assert torch.equal(output, expected)
|
||||
|
||||
|
||||
def test_multiple_sequences(sampler):
|
||||
def test_multiple_sequences(rejection_sampler):
|
||||
"""Test handling multiple sequences of speculated tokens"""
|
||||
spec_tokens = [[1, 2], [3]]
|
||||
output_tokens = [[1, 2, 5], [3,
|
||||
@ -105,15 +133,23 @@ def test_multiple_sequences(sampler):
|
||||
logits = create_logits_tensor(output_tokens)
|
||||
bonus_token_tensor = torch.tensor(
|
||||
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||
device=logits.device)
|
||||
|
||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
||||
expected = torch.tensor([[1, 2, 5], [3, 4, INVALID_TOKEN_ID]],
|
||||
output = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=None,
|
||||
target_logits=logits,
|
||||
bonus_token_ids=bonus_token_tensor,
|
||||
sampling_metadata=metadata,
|
||||
)
|
||||
expected = torch.tensor([[1, 2, 5], [3, 4, PLACEHOLDER_TOKEN_ID]],
|
||||
dtype=torch.int,
|
||||
device=logits.device)
|
||||
assert torch.equal(output, expected)
|
||||
|
||||
|
||||
def test_single_token_sequence(sampler):
|
||||
def test_single_token_sequence(rejection_sampler):
|
||||
"""Test handling sequences with single token"""
|
||||
spec_tokens = [[1]]
|
||||
output_tokens = [[1, 2]] # Single token with bonus token 2
|
||||
@ -122,13 +158,21 @@ def test_single_token_sequence(sampler):
|
||||
logits = create_logits_tensor(output_tokens)
|
||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||
device=logits.device)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||
device=logits.device)
|
||||
|
||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
||||
output = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=None,
|
||||
target_logits=logits,
|
||||
bonus_token_ids=bonus_token_tensor,
|
||||
sampling_metadata=metadata,
|
||||
)
|
||||
expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
|
||||
assert torch.equal(output, expected)
|
||||
|
||||
|
||||
def test_empty_sequence(sampler):
|
||||
def test_empty_sequence(rejection_sampler):
|
||||
"""Test handling empty sequence of speculated tokens"""
|
||||
spec_tokens: list[list[int]] = [[]]
|
||||
output_tokens = [[5]] # Just the bonus token
|
||||
@ -137,13 +181,21 @@ def test_empty_sequence(sampler):
|
||||
logits = create_logits_tensor(output_tokens)
|
||||
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
|
||||
device=logits.device)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||
device=logits.device)
|
||||
|
||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
||||
output = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=None,
|
||||
target_logits=logits,
|
||||
bonus_token_ids=bonus_token_tensor,
|
||||
sampling_metadata=metadata,
|
||||
)
|
||||
expected = torch.tensor([[5]], dtype=torch.int, device=logits.device)
|
||||
assert torch.equal(output, expected)
|
||||
|
||||
|
||||
def test_multiple_mismatches(sampler):
|
||||
def test_multiple_mismatches(rejection_sampler):
|
||||
"""Test handling multiple sequences with mismatches"""
|
||||
spec_tokens = [[1, 2, 3], [4, 5, 6]]
|
||||
output_tokens = [[1, 2, 7, 6], [4, 8, 6,
|
||||
@ -153,12 +205,22 @@ def test_multiple_mismatches(sampler):
|
||||
logits = create_logits_tensor(output_tokens)
|
||||
bonus_token_tensor = torch.tensor(
|
||||
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||
device=logits.device)
|
||||
|
||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
||||
expected = torch.tensor([[1, 2, 7, INVALID_TOKEN_ID],
|
||||
[4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
|
||||
dtype=torch.int,
|
||||
device=logits.device)
|
||||
output = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=None,
|
||||
target_logits=logits,
|
||||
bonus_token_ids=bonus_token_tensor,
|
||||
sampling_metadata=metadata,
|
||||
)
|
||||
expected = torch.tensor(
|
||||
[[1, 2, 7, PLACEHOLDER_TOKEN_ID],
|
||||
[4, 8, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID]],
|
||||
dtype=torch.int,
|
||||
device=logits.device,
|
||||
)
|
||||
assert torch.equal(output, expected)
|
||||
|
||||
|
||||
@ -166,18 +228,27 @@ def test_multiple_mismatches(sampler):
|
||||
"spec_tokens,output_tokens,expected",
|
||||
[
|
||||
([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]), # Perfect match with bonus
|
||||
([[1]], [[2, 3]], [[2, INVALID_TOKEN_ID]]), # First mismatch
|
||||
([[1]], [[2, 3]], [[2, PLACEHOLDER_TOKEN_ID]]), # First mismatch
|
||||
([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]],
|
||||
[[1, 5, INVALID_TOKEN_ID], [3, 4, 7]]), # Mixed matches
|
||||
[[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]]), # Mixed matches
|
||||
])
|
||||
def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected):
|
||||
def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens,
|
||||
expected):
|
||||
"""Parametrized test for various matching scenarios"""
|
||||
metadata = create_sampling_metadata(all_greedy=True)
|
||||
logits = create_logits_tensor(output_tokens)
|
||||
bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens],
|
||||
device=logits.device)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
|
||||
device=logits.device)
|
||||
|
||||
output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
|
||||
output = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=None,
|
||||
target_logits=logits,
|
||||
bonus_token_ids=bonus_token_tensor,
|
||||
sampling_metadata=metadata,
|
||||
)
|
||||
expected_tensor = torch.tensor(expected,
|
||||
dtype=torch.int,
|
||||
device=logits.device)
|
||||
@ -190,21 +261,31 @@ def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected):
|
||||
@pytest.mark.parametrize("batch_size", [1, 4, 8])
|
||||
@pytest.mark.parametrize("frac_seeded", [0.0, 0.5])
|
||||
@pytest.mark.parametrize("n_rep", [20])
|
||||
def test_deterministic_when_seeded(sampler, k: int, vocab_size: int,
|
||||
batch_size: int, frac_seeded: float,
|
||||
n_rep: int):
|
||||
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
|
||||
target_probs = torch.rand(batch_size * (k + 1),
|
||||
vocab_size,
|
||||
dtype=torch.float32)
|
||||
def test_deterministic_when_seeded(
|
||||
rejection_sampler,
|
||||
k: int,
|
||||
vocab_size: int,
|
||||
batch_size: int,
|
||||
frac_seeded: float,
|
||||
n_rep: int,
|
||||
):
|
||||
num_tokens = batch_size * k
|
||||
draft_probs = torch.rand(num_tokens,
|
||||
vocab_size,
|
||||
dtype=torch.float32,
|
||||
device=DEVICE)
|
||||
draft_probs = F.softmax(draft_probs, dim=-1)
|
||||
target_logits = torch.rand_like(draft_probs)
|
||||
bonus_token_ids = torch.randint(low=0,
|
||||
high=vocab_size,
|
||||
size=(batch_size, 1),
|
||||
dtype=torch.int64)
|
||||
dtype=torch.int64,
|
||||
device=DEVICE)
|
||||
draft_token_ids = torch.randint(low=0,
|
||||
high=vocab_size,
|
||||
size=(batch_size, k),
|
||||
dtype=torch.int64)
|
||||
dtype=torch.int64,
|
||||
device=DEVICE)
|
||||
|
||||
seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
|
||||
|
||||
@ -215,10 +296,21 @@ def test_deterministic_when_seeded(sampler, k: int, vocab_size: int,
|
||||
for i in range(batch_size) if seeded_mask[i]
|
||||
}
|
||||
|
||||
temperature = torch.ones(batch_size,
|
||||
dtype=torch.float32,
|
||||
device=DEVICE)
|
||||
sampling_metadata = create_sampling_metadata(all_greedy=False,
|
||||
temperature=temperature,
|
||||
generators=seeded_seqs)
|
||||
rep_result = sampler(draft_token_ids.tolist(), draft_probs,
|
||||
bonus_token_ids, target_probs, sampling_metadata)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(
|
||||
draft_token_ids.tolist(), device=DEVICE)
|
||||
rep_result = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=draft_probs,
|
||||
target_logits=target_logits,
|
||||
bonus_token_ids=bonus_token_ids,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
|
||||
results.append(rep_result)
|
||||
|
||||
@ -257,10 +349,10 @@ def test_rejection_sampling_approximates_target_distribution():
|
||||
num_reference_probs = 100
|
||||
|
||||
# Prepare draft, target, and reference probability distributions
|
||||
draft_probs, target_probs = (F.softmax(
|
||||
torch.rand(vocab_size, dtype=torch.float32),
|
||||
dim=-1,
|
||||
) for _ in range(2))
|
||||
draft_probs = F.softmax(torch.rand(vocab_size, dtype=torch.float32),
|
||||
dim=-1)
|
||||
target_logits = torch.rand(vocab_size, dtype=torch.float32)
|
||||
target_probs = F.softmax(target_logits, dim=-1)
|
||||
reference_probs = F.softmax(
|
||||
torch.rand(num_reference_probs, vocab_size, dtype=torch.float32),
|
||||
dim=-1,
|
||||
@ -273,7 +365,7 @@ def test_rejection_sampling_approximates_target_distribution():
|
||||
for num_samples in sample_sizes:
|
||||
# Sample using rejection sampling.
|
||||
rej_sample_probs = estimate_rejection_sampling_pdf(
|
||||
draft_probs, target_probs, k, vocab_size, num_samples)
|
||||
draft_probs, target_logits, k, vocab_size, num_samples)
|
||||
rej_sample_probs = rej_sample_probs.to(DEVICE)
|
||||
|
||||
# Average distance from reference probs.
|
||||
@ -313,7 +405,7 @@ def get_ratio_first_to_last(elements: list[float]) -> float:
|
||||
|
||||
def estimate_rejection_sampling_pdf(
|
||||
draft_probs: torch.Tensor,
|
||||
target_probs: torch.Tensor,
|
||||
target_logits: torch.Tensor,
|
||||
k: int,
|
||||
vocab_size: int,
|
||||
num_samples: int,
|
||||
@ -323,35 +415,44 @@ def estimate_rejection_sampling_pdf(
|
||||
|
||||
Args:
|
||||
draft_probs: Draft probability distribution.
|
||||
target_probs: Target probability distribution.
|
||||
target_logits: Target logits.
|
||||
num_samples: Number of samples to draw.
|
||||
|
||||
Returns:
|
||||
Estimated probability distribution of the output tokens.
|
||||
"""
|
||||
sampler = RejectionSampler()
|
||||
# Repeat draft probs num_samples times.
|
||||
rejection_sampler = RejectionSampler()
|
||||
num_tokens = num_samples * k
|
||||
# Repeat draft probs num_samples * k times.
|
||||
draft_probs = draft_probs.reshape(1, 1,
|
||||
vocab_size).repeat(num_samples, k, 1)
|
||||
|
||||
# Repeat target probs num_samples * (k + 1) times.
|
||||
target_probs = target_probs.reshape(1, 1, vocab_size).repeat(
|
||||
num_samples, k + 1, 1).reshape(num_samples * (k + 1), vocab_size)
|
||||
# Repeat target probs num_tokens times.
|
||||
target_logits = target_logits.reshape(1, vocab_size).repeat(num_tokens, 1)
|
||||
|
||||
# Randomly sample draft token ids from draft probs.
|
||||
draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
|
||||
num_samples=k,
|
||||
replacement=True).reshape(
|
||||
num_samples, k)
|
||||
draft_probs = draft_probs.view(num_tokens, vocab_size)
|
||||
|
||||
# Bonus tokens not used but required.
|
||||
bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64,
|
||||
device=DEVICE).repeat(num_samples, 1)
|
||||
|
||||
sampling_metadata = create_sampling_metadata(all_greedy=False)
|
||||
output_token_ids = sampler(draft_token_ids.tolist(), draft_probs,
|
||||
bonus_token_ids, target_probs,
|
||||
sampling_metadata)
|
||||
temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE)
|
||||
sampling_metadata = create_sampling_metadata(all_greedy=False,
|
||||
temperature=temperature)
|
||||
spec_decode_metadata = SpecDecodeMetadata.make_dummy(
|
||||
draft_token_ids.tolist(), device=bonus_token_ids.device)
|
||||
output_token_ids = rejection_sampler(
|
||||
spec_decode_metadata,
|
||||
draft_probs=draft_probs,
|
||||
target_logits=target_logits,
|
||||
bonus_token_ids=bonus_token_ids,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
output_token_ids = output_token_ids[:, :-1].flatten()
|
||||
|
||||
hist = torch.histogram(output_token_ids.to(dtype=torch.float,
|
||||
|
||||
@ -15,9 +15,10 @@ if TYPE_CHECKING:
|
||||
from tests.conftest import VllmRunner
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen2.5-1.5B-Instruct",
|
||||
# TODO: Enable this models with v6e
|
||||
# "Qwen/Qwen2-7B-Instruct",
|
||||
"meta-llama/Llama-3.1-8B",
|
||||
# TODO: Add models here as necessary
|
||||
# "meta-llama/Llama-3.1-8B",
|
||||
]
|
||||
|
||||
TENSOR_PARALLEL_SIZES = [1]
|
||||
|
||||
94
tests/v1/tpu/test_sampler.py
Normal file
94
tests/v1/tpu/test_sampler.py
Normal file
@ -0,0 +1,94 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import tempfile
|
||||
from time import time
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, envs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
if not envs.VLLM_USE_V1:
|
||||
pytest.skip(
|
||||
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", ["D4nt3/Qwen2.5-two-layers"])
|
||||
@pytest.mark.skipif(not current_platform.is_tpu(),
|
||||
reason="This test needs a TPU")
|
||||
def test_sampler_compilation(model_name: str, monkeypatch):
|
||||
"""
|
||||
Check that no recompilation happens despite changing sampling parameters.
|
||||
We can't read XLA metrics from the engine process, hence we measure time.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
monkeypatch.setenv("VLLM_XLA_CACHE_PATH", temp_dir)
|
||||
# Compiling model init may still take some time, enforce_eager to skip.
|
||||
llm = LLM(model_name,
|
||||
enforce_eager=True,
|
||||
max_num_seqs=16,
|
||||
max_model_len=1024,
|
||||
gpu_memory_utilization=0.5)
|
||||
prompts = [
|
||||
"A robot may not injure a human being",
|
||||
"It is only with the heart that one can see rightly;",
|
||||
]
|
||||
# First inference should be slow
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.7,
|
||||
# top_p=0.6, # TODO too slow!
|
||||
# top_k=10,
|
||||
min_p=0.2,
|
||||
max_tokens=16)
|
||||
s = time()
|
||||
_ = llm.generate(prompts, sampling_params)
|
||||
run1 = time() - s
|
||||
|
||||
# Second request with different params, but for which we
|
||||
# compiled for in previous eager iteration.
|
||||
sampling_params = SamplingParams(temperature=0.1,
|
||||
min_p=0.8,
|
||||
max_tokens=24)
|
||||
s = time()
|
||||
_ = llm.generate(prompts, sampling_params)
|
||||
run2 = time() - s
|
||||
# Much faster after compiling
|
||||
assert run1 * 0.1 > run2
|
||||
print("TIMES", run1, run2)
|
||||
|
||||
# Third request with min_p set to "None". It will not trigger
|
||||
# recompilation as a default 0 value will be used.
|
||||
sampling_params = SamplingParams(max_tokens=24, temperature=0.0)
|
||||
s = time()
|
||||
_ = llm.generate(prompts, sampling_params)
|
||||
run3 = time() - s
|
||||
assert run1 * 0.1 > run3
|
||||
print("TIMES", run1, run3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", ["Qwen/Qwen2.5-1.5B-Instruct"])
|
||||
@pytest.mark.skipif(not current_platform.is_tpu(),
|
||||
reason="This test needs a TPU")
|
||||
def test_sampler_different(model_name: str):
|
||||
"""
|
||||
Test significantly different sampling params to assert the model produces
|
||||
different results.
|
||||
"""
|
||||
llm = LLM(
|
||||
model_name,
|
||||
enforce_eager=True,
|
||||
max_num_seqs=1,
|
||||
max_model_len=64,
|
||||
# TODO: setting to 0.5 or it will go OOM
|
||||
gpu_memory_utilization=0.5)
|
||||
prompts = [
|
||||
"Write a short story about a robot that dreams for the first time."
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.9, min_p=0.2, max_tokens=64)
|
||||
output = llm.generate(prompts, sampling_params)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.1, min_p=0.8, max_tokens=64)
|
||||
output2 = llm.generate(prompts, sampling_params)
|
||||
assert output[0].outputs[0].text != output2[0].outputs[0].text
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user