Compare commits


56 Commits

Author SHA1 Message Date
617fb893d5 add compile 2024-07-26 19:29:36 -07:00
55712941e5 [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) 2024-07-27 02:27:44 +00:00
981b0d5673 [Frontend] Factor out code for running uvicorn (#6828) 2024-07-27 09:58:25 +08:00
d09b94ca58 [TPU] Support collective communications in XLA devices (#6813) 2024-07-27 01:45:57 +00:00
bb5494676f enforce eager mode with bnb quantization temporarily (#6846) 2024-07-27 01:32:20 +00:00
b5f49ee55b Update README.md (#6847) 2024-07-27 00:26:45 +00:00
150a1ffbfd [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) 2024-07-26 14:39:10 -07:00
281977bd6e [Doc] Add Nemotron to supported model docs (#6843) 2024-07-26 17:32:44 -04:00
3bbb4936dc [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) 2024-07-26 13:50:10 -07:00
aa4867791e [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) 2024-07-26 19:39:49 +00:00
71734f1bf2 [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) 2024-07-26 12:28:32 -07:00
50704f52c4 [Bugfix][Kernel] Promote another index to int64_t (#6838) 2024-07-26 18:41:04 +00:00
07278c37dd [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) 2024-07-26 14:33:42 -04:00
85ad7e2d01 [doc][debugging] add known issues for hangs (#6816) 2024-07-25 21:48:05 -07:00
89a84b0bb7 [Core] Use array to speedup padding (#6779) 2024-07-25 21:31:31 -07:00
084a01fd35 [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. (#6770) 2024-07-25 21:25:35 -07:00
062a1d0fab Fix ReplicatedLinear weight loading (#6793) 2024-07-25 19:24:58 -07:00
2eb9f4ff26 [ci] Mark tensorizer as soft fail and separate from grouped test (#6810)
[ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810)
Signed-off-by: kevin <kevin@anyscale.com>
2024-07-25 18:08:33 -07:00
443c7cf4cf [ci][distributed] fix flaky tests (#6806) 2024-07-25 17:44:09 -07:00
1adddb14bf [Core] Fix ray forward_dag error mssg (#6792) 2024-07-25 16:53:25 -07:00
b7215de2c5 [Docs] Publish 5th meetup slides (#6799) 2024-07-25 16:47:55 -07:00
f3ff63c3f4 [doc][distributed] improve multinode serving doc (#6804) 2024-07-25 15:38:32 -07:00
cd7edc4e87 [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) 2024-07-25 15:05:09 -07:00
6a1e25b151 [Doc] Add documentations for nightly benchmarks (#6412) 2024-07-25 11:57:16 -07:00
95db75de64 [Bugfix] Add synchronize to prevent possible data race (#6788)
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2024-07-25 10:40:01 -07:00
65b1f121c8 [Bugfix] Fix kv_cache_dtype=fp8 without scales for FP8 checkpoints (#6761) 2024-07-25 09:46:15 -07:00
889da130e7 [ Misc ] fp8-marlin channelwise via compressed-tensors (#6524)
Co-authored-by: mgoin <michael@neuralmagic.com>
2024-07-25 09:46:04 -07:00
b75e314fff [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787)
Co-authored-by: hezhihui <hzh7269@modelbest.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2024-07-25 09:42:49 -07:00
316a41ac1d [Bugfix] Fix encoding_format in examples/openai_embedding_client.py (#6755) 2024-07-24 22:48:07 -07:00
0310029a2f [Bugfix] Fix awq_marlin and gptq_marlin flags (#6745) 2024-07-24 22:34:11 -07:00
309aaef825 [Bugfix] Fix decode tokens w. CUDA graph (#6757) 2024-07-24 22:33:56 -07:00
9e169a4c61 [Model] Adding support for MiniCPM-V (#4087) 2024-07-24 20:59:30 -07:00
5689e256ba [Frontend] Represent tokens with identifiable strings (#6626) 2024-07-25 09:51:00 +08:00
740374d456 [core][distributed] fix zmq hang (#6759) 2024-07-24 17:37:12 -07:00
d88c458f44 [Doc][AMD][ROCm]Added tips to refer to mi300x tuning guide for mi300x users (#6754) 2024-07-24 14:32:57 -07:00
421e218b37 [Bugfix] Bump transformers to 4.43.2 (#6752) 2024-07-24 13:22:16 -07:00
5448f67635 [Core] Tweaks to model runner/input builder developer APIs (#6712) 2024-07-24 12:17:12 -07:00
0e63494cf3 Add fp8 support to reshape_and_cache_flash (#6667) 2024-07-24 18:36:52 +00:00
ee812580f7 [Frontend] split run_server into build_server and run_server (#6740) 2024-07-24 10:36:04 -07:00
40468b13fa [Bugfix] Miscalculated latency lead to time_to_first_token_seconds inaccurate. (#6686) 2024-07-24 08:58:42 -07:00
2cf0df3381 [Bugfix] Fix speculative decode seeded test (#6743) 2024-07-24 08:58:31 -07:00
545146349c Adding f-string to validation error which is missing (#6748) 2024-07-24 08:55:53 -07:00
f4f8a9d892 [Bugfix]fix modelscope compatible issue (#6730) 2024-07-24 05:04:46 -07:00
b570811706 [Build/CI] Update run-amd-test.sh. Enable Docker Hub login. (#6711) 2024-07-24 05:01:14 -07:00
ccc4a73257 [Docs][ROCm] Detailed instructions to build from source (#6680) 2024-07-24 01:07:23 -07:00
0a740a11ba [Bugfix] Fix token padding for chameleon (#6724) 2024-07-24 01:05:09 -07:00
c882a7f5b3 [SpecDecoding] Update MLPSpeculator CI tests to use smaller model (#6714) 2024-07-24 07:34:22 +00:00
5e8ca973eb [Bugfix] fix flashinfer cudagraph capture for PP (#6708) 2024-07-24 01:49:44 +00:00
87525fab92 [bitsandbytes]: support read bnb pre-quantized model (#5753)
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-07-23 23:45:09 +00:00
2f808e69ab [Bugfix] StatLoggers: cache spec decode metrics when they get collected. (#6645)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2024-07-23 23:05:05 +00:00
01c16ede6b [CI] Add smoke test for non-uniform AutoFP8 quantization (#6702) 2024-07-23 22:45:12 +00:00
72fc704803 [build] relax wheel size limit (#6704) 2024-07-23 14:03:49 -07:00
1bedf210e3 Bump transformers version for Llama 3.1 hotfix and patch Chameleon (#6690) 2024-07-23 13:47:48 -07:00
507ef787d8 [Model] Pipeline Parallel Support for DeepSeek v2 (#6519)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2024-07-23 12:22:09 -07:00
58f53034ad [Frontend] Add Usage data in each chunk for chat_serving. #6540 (#6652) 2024-07-23 11:41:55 -07:00
0eb0757bef [Misc] Add ignored layers for fp8 quantization (#6657) 2024-07-23 14:04:04 -04:00
126 changed files with 4064 additions and 1001 deletions

View File

@ -1,7 +1,7 @@
import os
import zipfile
MAX_SIZE_MB = 200
MAX_SIZE_MB = 250
def print_top_10_largest_files(zip_file):

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
model_name: "nvidia/Minitron-4B-Base"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.252
- name: "exact_match,flexible-extract"
value: 0.252
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.578
- name: "exact_match,flexible-extract"
value: 0.585
limit: 1000
num_fewshot: 5

View File

@ -4,4 +4,6 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml

View File

@ -3,30 +3,51 @@
## Introduction
This directory contains the performance benchmarking CI for vllm.
The goal is to help developers know the impact of their PRs on the performance of vllm.
This directory contains two sets of benchmarks for vllm.
- Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
This benchmark will be *triggered* upon:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for more GPUs is coming later), with different models.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.
**Benchmarking Duration**: about 1hr.
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
## Configuring the workload
## Nightly benchmark quick overview
The benchmarking workload contains three parts:
- Latency tests in `latency-tests.json`.
- Throughput tests in `throughput-tests.json`.
- Serving tests in `serving-tests.json`.
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
See [descriptions.md](tests/descriptions.md) for detailed descriptions.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
### Latency test
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `nightly-benchmarks` label.
## Performance benchmark details
See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test
Here is an example of one test inside `latency-tests.json`:
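The actual entry falls outside the diff context shown here. Purely as an illustration (the field names and values below are hypothetical, not taken from the repository), an entry pairs a test name with the CLI parameters forwarded to the latency benchmarking script; here it is written as a Python dict for readability:

```python
# Hypothetical sketch of a single latency-tests.json entry, shown as a Python
# dict; field names and values are illustrative only.
example_entry = {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "tensor_parallel_size": 1,
        "num_iters_warmup": 5,
        "num_iters": 15,
    },
}
```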
@ -54,12 +75,12 @@ Note that the performance numbers are highly sensitive to the value of the param
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
### Throughput test
#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The number of this test is also stable -- a slight change in the value of this number might vary the performance numbers by a lot.
### Serving test
#### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```
@ -96,9 +117,36 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
## Visualizing the results
#### Visualizing the results
The `convert-results-json-to-markdown.py` script helps you put the benchmarking results into a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the results presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job.
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for detailed descriptions of the test workload, models and docker containers used when benchmarking other LLM engines.
#### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- `run-nightly-suite.sh` then redirects the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to buildkite.
#### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmark.
#### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: updating `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -55,6 +55,7 @@ while true; do
done
echo "--- Pulling container"
docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN}
image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name}

View File

@ -3,26 +3,38 @@
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .
docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
--cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
# offline inference
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "cd tests;
docker exec cpu-test bash -c "
pip install pytest Pillow protobuf
cd ../
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
# online inference
docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"

View File

@ -17,11 +17,10 @@ steps:
- pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker
- label: Tensorizer, Metrics, Tracing Test
- label: Metrics, Tracing Test
fast_check: true
fast_check_only: true
commands:
- apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
@ -221,6 +220,8 @@ steps:
- label: Tensorizer Test
#mirror_hardwares: [amd]
soft_fail: true
fast_check: true
commands:
- apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn

View File

@ -2,8 +2,8 @@
FROM ubuntu:22.04 AS cpu-test-1
RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
RUN apt-get update -y \
&& apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
@ -13,8 +13,9 @@ RUN pip install intel-openmp
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy
@ -25,7 +26,7 @@ COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512

View File

@ -53,9 +53,9 @@ RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(whic
# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.1"*) \
python3 -m pip uninstall -y torch torchaudio torchvision \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --no-cache-dir --pre \
torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
torch==2.5.0.dev20240710 \
torchvision==0.20.0.dev20240710 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac
@ -127,13 +127,6 @@ FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually remove it so that later steps of numpy upgrade can continue
RUN case "$(which python3)" in \
*"/opt/conda/envs/py_3.9"*) \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
*) ;; esac
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli]

View File

@ -16,16 +16,8 @@ Easy, fast, and cheap LLM serving for everyone
---
**The Fifth vLLM Bay Area Meetup (July 24th 5pm-8pm PT)**
We are excited to announce our fifth vLLM Meetup!
Join us to hear the vLLM's recent updates and the upcoming roadmap.
Additionally, our collaborators from AWS will be presenting their insights and experiences in deploying vLLM.
Register now [here](https://lu.ma/lp0gyjqr) and be part of the event!
---
*Latest News* 🔥
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
@ -47,7 +39,7 @@ vLLM is fast with:
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
vLLM is flexible and easy to use with:

View File

@ -83,6 +83,8 @@ endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS "numa")
#
# Define extension targets
@ -95,6 +97,7 @@ set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
"csrc/cpu/attention.cpp"
"csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp")
@ -104,6 +107,7 @@ define_gpu_extension_target(
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI

View File

@ -25,7 +25,8 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype);
const std::string& kv_cache_dtype,
const double k_scale, const double v_scale);
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,

View File

@ -203,17 +203,18 @@ __global__ void reshape_and_cache_kernel(
}
}
template <typename scalar_t>
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads,
cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads,
// head_size]
scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads,
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
// head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, const int key_stride, const int value_stride,
const int num_heads, const int head_size, const int block_size) {
const int num_heads, const int head_size, const int block_size,
const float k_scale, const float v_scale) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
@ -228,11 +229,20 @@ __global__ void reshape_and_cache_flash_kernel(
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_value_idx = block_idx * block_stride +
block_offset * num_heads * head_size +
head_idx * head_size + head_offset;
k_cache[tgt_value_idx] = key[src_key_idx];
v_cache[tgt_value_idx] = value[src_value_idx];
const int64_t tgt_key_value_idx = block_idx * block_stride +
block_offset * num_heads * head_size +
head_idx * head_size + head_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
key_cache[tgt_key_value_idx] = tgt_key;
value_cache[tgt_key_value_idx] = tgt_value;
} else {
key_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, k_scale);
value_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, v_scale);
}
}
}
} // namespace vllm
@ -278,40 +288,45 @@ void reshape_and_cache(
CALL_RESHAPE_AND_CACHE)
}
// KV_T is the data type of the key and value tensors.
// CACHE_T is the stored data type of the kv-cache.
// KV_DTYPE is the real data type of the kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(key.data_ptr()), \
reinterpret_cast<KV_T*>(value.data_ptr()), \
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, key_stride, \
value_stride, num_heads, head_size, block_size, k_scale, v_scale);
void reshape_and_cache_flash(
torch::Tensor& key, // [num_tokens, num_heads, head_size]
torch::Tensor& value, // [num_tokens, num_heads, head_size]
torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& key, // [num_tokens, num_heads, head_size]
torch::Tensor& value, // [num_tokens, num_heads, head_size]
torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor&
value_cache, // [num_blocks, block_size, num_heads, head_size]
torch::Tensor& slot_mapping, // [num_tokens]
const std::string& kv_cache_dtype) {
// FIXME: only support auto datatype, does not support fp8
if (kv_cache_dtype != "auto") {
TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
}
const std::string& kv_cache_dtype, const double k_scale,
const double v_scale) {
int num_tokens = key.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = k_cache.size(1);
int block_size = key_cache.size(1);
int key_stride = key.stride(0);
int value_stride = value.stride(0);
int block_stride = k_cache.stride(0);
TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0));
int block_stride = key_cache.stride(0);
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));
dim3 grid(num_tokens);
dim3 block(std::min(num_heads * head_size, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(), "reshape_and_cache_flash", [&] {
vllm::reshape_and_cache_flash_kernel<scalar_t>
<<<grid, block, 0, stream>>>(
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
k_cache.data_ptr<scalar_t>(), v_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), block_stride, key_stride,
value_stride, num_heads, head_size, block_size);
});
DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
CALL_RESHAPE_AND_CACHE_FLASH);
}
namespace vllm {

View File

@ -4,6 +4,8 @@
#include <torch/library.h>
void init_cpu_threads_env(const std::string& cpu_ids);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
@ -107,4 +109,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
// CPU utils
utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env);
}
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

csrc/cpu/utils.cpp (new file, 65 lines)
View File

@ -0,0 +1,65 @@
#include <numa.h>
#include <unistd.h>
#include <string>
#include <sched.h>
#include "cpu_types.hpp"
void init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids;
omp_cpu_ids.reserve(omp_cpu_mask->size);
constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp);
for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) {
unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size];
int i = 0;
while (group_mask) {
if (group_mask & 1) {
omp_cpu_ids.emplace_back(offset + i);
}
++i;
group_mask >>= 1;
}
}
// Memory node binding
if (numa_available() != -1) {
int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str());
bitmask* src_mask = numa_get_membind();
int pid = getpid();
// move all existing pages to the specified numa node.
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
int page_num = numa_migrate_pages(pid, src_mask, mask);
if (page_num == -1) {
TORCH_CHECK(false,
"numa_migrate_pages failed. errno: " + std::to_string(errno));
}
// restrict memory allocation node.
numa_set_membind(mask);
numa_set_strict(1);
}
// OMP threads binding
omp_set_num_threads((int)omp_cpu_ids.size());
torch::set_num_threads((int)omp_cpu_ids.size());
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
#pragma omp parallel for schedule(static, 1)
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size);
size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size);
CPU_ZERO_S(size, mask);
CPU_SET_S(omp_cpu_ids[i], size, mask);
sched_setaffinity(0, sizeof(cpu_set_t), mask);
CPU_FREE(mask);
}
numa_free_nodemask(omp_cpu_mask);
}

View File

@ -328,20 +328,36 @@ struct Sm90ColOrScalarBroadcast {
return EmptyProducerLoadCallbacks{};
}
template<class GTensor, class RTensor>
template<class GTensor, class RTensor, class CTensor, class ProblemShape>
struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
CUTLASS_DEVICE
ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params)
: tCgCol(cute::forward<GTensor>(tCgCol)),
tCrCol(cute::forward<RTensor>(tCrCol)),
params(params) {}
ConsumerStoreCallbacks(
GTensor&& tCgCol,
RTensor&& tCrCol,
CTensor&& tCcCol,
ProblemShape problem_shape,
Params const& params
):
tCgCol(cute::forward<GTensor>(tCgCol)),
tCrCol(cute::forward<RTensor>(tCrCol)),
tCcCol(cute::forward<CTensor>(tCcCol)),
m(get<0>(problem_shape)),
params(params) {}
GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
RTensor tCrCol;
CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
Params const& params;
int m;
CUTLASS_DEVICE void
begin() {
Tensor pred = make_tensor<bool>(shape(tCgCol));
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < size(pred); ++i) {
pred(i) = get<0>(tCcCol(i)) < m;
}
if (!params.col_broadcast) {
fill(tCrCol, *(params.ptr_col));
return;
@ -349,7 +365,7 @@ struct Sm90ColOrScalarBroadcast {
// Filter so we don't issue redundant copies over stride-0 modes
// (only works if 0-strides are in same location, which is by construction)
copy_aligned(filter(tCgCol), filter(tCrCol));
copy_if(pred, filter(tCgCol), filter(tCrCol));
}
template <typename ElementAccumulator, int FragmentSize>
@ -381,8 +397,20 @@ struct Sm90ColOrScalarBroadcast {
mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
return ConsumerStoreCallbacks<decltype(tCgCol), decltype(tCrCol)>(
cute::move(tCgCol), cute::move(tCrCol), params);
// Generate an identity tensor matching the shape of the global tensor and
// partition the same way, this will be used to generate the predicate
// tensor for loading
Tensor cCol = make_identity_tensor(mCol.shape());
Tensor tCcCol = sm90_partition_for_epilogue<ReferenceSrc>( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
return ConsumerStoreCallbacks(
cute::move(tCgCol),
cute::move(tCrCol),
cute::move(tCcCol),
args.problem_shape_mnkl,
params
);
}
};

View File

@ -48,7 +48,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
const scalar_t* __restrict__ input,
int64_t num_elems) {
__shared__ float cache[1024];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
// First store the maximum for all values processed by
// the current thread in cache[threadIdx.x]

View File

@ -248,7 +248,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor! key_cache,"
" Tensor! value_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype) -> ()");
" str kv_cache_dtype,"
" float k_scale, float v_scale) -> ()");
cache_ops.impl("reshape_and_cache_flash", torch::kCUDA,
&reshape_and_cache_flash);
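For reference, here is a minimal sketch of calling the updated op from Python once the vLLM C extension has been loaded. The ``torch.ops._C_cache_ops`` namespace is an assumption based on the ``TORCH_EXTENSION_NAME`` concatenation above, and the tensor shapes simply follow the comments in ``csrc/cache.h``:

```python
import torch
import vllm  # assumption: importing vllm loads the compiled extension that registers _cache_ops

num_tokens, num_heads, head_size = 16, 8, 128
num_blocks, block_size = 4, 16

key = torch.randn(num_tokens, num_heads, head_size, device="cuda", dtype=torch.float16)
value = torch.randn_like(key)
key_cache = torch.zeros(num_blocks, block_size, num_heads, head_size,
                        device="cuda", dtype=torch.float16)
value_cache = torch.zeros_like(key_cache)
slot_mapping = torch.arange(num_tokens, dtype=torch.int64, device="cuda")

# "auto" keeps the cache in the source dtype, so the new k_scale / v_scale
# arguments are passed through unused as 1.0; an fp8 cache would supply real scales.
torch.ops._C_cache_ops.reshape_and_cache_flash(
    key, value, key_cache, value_cache, slot_mapping, "auto", 1.0, 1.0)
```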

View File

@ -5,6 +5,7 @@ vLLM Meetups
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
- `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__

View File

@ -40,6 +40,8 @@ Registry
Base Classes
------------
.. autodata:: vllm.multimodal.NestedTensors
.. autodata:: vllm.multimodal.BatchedTensors
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins

View File

@ -107,9 +107,45 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
$ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation
.. tip::
For example, vLLM v0.5.3 on ROCM 6.1 can be built with the following steps:
.. code-block:: console
$ pip install --upgrade pip
$ # Install PyTorch
$ pip uninstall torch -y
$ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
$ # Build & install AMD SMI
$ pip install /opt/rocm/share/amd_smi
$ # Install dependencies
$ pip install --upgrade numba scipy huggingface-hub[cli]
$ pip install "numpy<2"
$ pip install -r requirements-rocm.txt
$ # Apply the patch to ROCM 6.1 (requires root permission)
$ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib
$ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so*
$ # Build vLLM for MI210/MI250/MI300.
$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
$ python3 setup.py develop
.. tip::
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
- To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention.
- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
.. tip::
- For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html>`_ for performance optimization and tuning tips on system and workflow level.
For vLLM, please refer to `vLLM performance optimization <https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization>`_.

View File

@ -10,6 +10,7 @@ Table of contents:
#. :ref:`Requirements <cpu_backend_requirements>`
#. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>`
#. :ref:`Build from source <build_cpu_backend_from_source>`
#. :ref:`Related runtime environment variables <env_intro>`
#. :ref:`Intel Extension for PyTorch <ipex_guidance>`
#. :ref:`Performance tips <cpu_backend_performance_tips>`
@ -47,7 +48,7 @@ Build from source
.. code-block:: console
$ sudo apt-get update -y
$ sudo apt-get install -y gcc-12 g++-12
$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev
$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
- Second, install Python packages for vLLM CPU backend building:
@ -71,6 +72,15 @@ Build from source
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
.. _env_intro:
Related runtime environment variables
-------------------------------------
- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g., ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache); a larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means 32 OpenMP threads are bound to CPU cores 0-31, and ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means 2 tensor parallel processes are launched, with the 32 OpenMP threads of rank 0 bound to CPU cores 0-31 and the OpenMP threads of rank 1 bound to CPU cores 32-63 (see the sketch below).
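A minimal sketch of setting both variables from Python before constructing the engine (assuming the variables are read when the engine starts; ``facebook/opt-125m`` is used purely as an illustrative model):

.. code-block:: python

   import os

   # Illustrative values: 40 GB of KV cache, two tensor-parallel ranks bound
   # to separate core ranges (rank 0 -> cores 0-31, rank 1 -> cores 32-63).
   os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"
   os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-31|32-63"

   from vllm import LLM

   llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2)
   print(llm.generate("Hello, my name is")[0].outputs[0].text)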
.. _ipex_guidance:
Intel Extension for PyTorch
@ -78,15 +88,11 @@ Intel Extension for PyTorch
- `Intel Extension for PyTorch (IPEX) <https://github.com/intel/intel-extension-for-pytorch>`_ extends PyTorch with up-to-date features and optimizations for an extra performance boost on Intel hardware.
- IPEX versions after ``2.3.0`` are enabled in the CPU backend by default if IPEX is installed.
.. _cpu_backend_performance_tips:
Performance tips
-----------------
- vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- We highly recommend using TCMalloc for high-performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run:
.. code-block:: console
@ -96,11 +102,44 @@ Performance tips
$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
$ python examples/offline_inference.py # run vLLM
- vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it is critical to isolate the CPU cores used for OpenMP threads from other thread pools (like the web-service event loop), to avoid CPU oversubscription.
- When using online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:
- If using vLLM CPU backend on a bare-metal machine, it is recommended to disable hyper-threading.
.. code-block:: console
- If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores and memory nodes, to avoid remote memory node access. ``numactl`` is a useful tool for CPU core and memory binding on NUMA platforms. Besides, the ``--cpuset-cpus`` and ``--cpuset-mems`` arguments of ``docker run`` are also useful.
$ export VLLM_CPU_KVCACHE_SPACE=40
$ export VLLM_CPU_OMP_THREADS_BIND=0-29
$ vllm serve facebook/opt-125m
- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread to each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
.. code-block:: console
$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
# On this platform, it is recommended to bind OpenMP threads only to logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference.py
- If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross-NUMA-node memory access.

View File

@ -65,6 +65,10 @@ Here are some common issues that can cause hangs:
If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
Some known issues:
- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq <https://github.com/zeromq/pyzmq/issues/2000>`_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix <https://github.com/vllm-project/vllm/pull/6759>`_ .
.. warning::
After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on.

View File

@ -105,6 +105,7 @@ Documentation
quantization/supported_hardware
quantization/auto_awq
quantization/bnb
quantization/fp8
quantization/fp8_e5m2_kvcache
quantization/fp8_e4m3_kvcache
@ -116,6 +117,12 @@ Documentation
automatic_prefix_caching/apc
automatic_prefix_caching/details
.. toctree::
:maxdepth: 1
:caption: Performance benchmarks
performance_benchmark/benchmarks
.. toctree::
:maxdepth: 2
:caption: Developer Documentation

View File

@ -113,6 +113,10 @@ Decoder-only Language Models
- MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
- :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
-
* - :code:`NemotronForCausalLM`
- Nemotron-3, Nemotron-4, Minitron
- :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
- ✅︎
* - :code:`OLMoForCausalLM`
- OLMo
- :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
@ -206,6 +210,10 @@ Vision Language Models
- Phi-3-Vision
- :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
-
* - :code:`MiniCPM-V`
- MiniCPM-V
- :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
-
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
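For instance, a minimal offline-inference sketch for one of the newly listed models (using ``nvidia/Minitron-8B-Base`` from the table above; the prompt and sampling settings are illustrative only):

.. code-block:: python

   from vllm import LLM, SamplingParams

   # Nemotron/Minitron checkpoints are served through NemotronForCausalLM (see the table above).
   llm = LLM(model="nvidia/Minitron-8B-Base")
   sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

   outputs = llm.generate(["The capital of France is"], sampling_params)
   print(outputs[0].outputs[0].text)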

View File

@ -0,0 +1,23 @@
.. _benchmarks:
Benchmark suites of vLLM
========================
vLLM contains two sets of benchmarks:
+ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard <https://perf.vllm.ai>`_ for the latest performance results.
+ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README <https://github.com/vllm-project/vllm/blob/main/README.md>`_.
Trigger a benchmark
-------------------
The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM and labeling the PR with `perf-benchmarks` and `nightly-benchmarks`.
.. note::
Please refer to `vLLM performance benchmark descriptions <https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/tests/descriptions.md>`_ and `vLLM nightly benchmark descriptions <https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/nightly-descriptions.md>`_ for detailed descriptions on benchmark environment, workload and metrics.

View File

@ -0,0 +1,43 @@
.. _bits_and_bytes:
BitsAndBytes
==================
vLLM now supports `BitsAndBytes <https://github.com/TimDettmers/bitsandbytes>`_ for more efficient model inference.
BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data.
Below are the steps to utilize BitsAndBytes with vLLM.
.. code-block:: console
$ pip install bitsandbytes>=0.42.0
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoints.
You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes.
Usually, these repositories have a config.json file that includes a quantization_config section.
Read quantized checkpoint.
--------------------------
.. code-block:: python
from vllm import LLM
import torch
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes")
Inflight quantization: load as 4bit quantization
------------------------------------------------
.. code-block:: python
from vllm import LLM
import torch
model_id = "huggyllama/llama-7b"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes")

View File

@ -79,7 +79,7 @@ On the rest of the worker nodes, run the following command:
$ --worker \
$ /path/to/the/huggingface/home/in/this/node
Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster.
Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, and execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
@ -101,7 +101,7 @@ You can also use tensor parallel without pipeline parallel, just set the tensor
To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
.. warning::
After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script <https://docs.vllm.ai/en/latest/getting_started/debugging.html>`_ for more information.
After you start the Ray cluster, you should also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script <https://docs.vllm.ai/en/latest/getting_started/debugging.html>`_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for processes on the same node, not for processes on other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion <https://github.com/vllm-project/vllm/issues/6803>`_ for more information.
.. warning::

View File

@ -5,9 +5,9 @@ Deploying and scaling up with SkyPilot
.. raw:: html
<p align="center">
<img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
</p>
<p align="center">
<img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
</p>
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery <https://skypilot.readthedocs.io/en/latest/gallery/index.html>`__.
@ -21,8 +21,8 @@ Prerequisites
.. code-block:: console
pip install skypilot-nightly
sky check
pip install skypilot-nightly
sky check
Run on a single instance
@ -32,64 +32,64 @@ See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypil
.. code-block:: yaml
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
pip install vllm==0.4.0.post1
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
pip install vllm==0.4.0.post1
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
echo 'Waiting for vllm api server to start...'
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
echo 'Waiting for vllm api server to start...'
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
.. code-block:: console
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do text completion.
.. code-block:: console
(task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
(task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
**Optional**: Serve the 70B model instead of the default 8B and use more GPUs:
.. code-block:: console
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
Scale up to multiple replicas
@ -99,151 +99,212 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
.. code-block:: yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
.. raw:: html
<details>
<summary>Click to see the full recipe YAML</summary>
<details>
<summary>Click to see the full recipe YAML</summary>
.. code-block:: yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
pip install vllm==0.4.0.post1
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
echo 'Waiting for vllm api server to start...'
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
.. raw:: html
</details>
Start serving the Llama-3 8B model on multiple replicas:
.. code-block:: console
HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
Wait until the service is ready:
.. code-block:: console
watch -n10 sky serve status vllm
.. raw:: html
<details>
<summary>Example outputs:</summary>
.. code-block:: console
Services
NAME VERSION UPTIME STATUS REPLICAS ENDPOINT
vllm 1 35s READY 2/2 xx.yy.zz.100:30001
Service Replicas
SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION
vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP({'L4': 1}) READY us-east4
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP({'L4': 1}) READY us-east4
Service Replicas
SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION
vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
.. raw:: html
</details>
After the service is READY, you can find a single endpoint for the service and send requests to it:
.. code-block:: console
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
curl -L http://$ENDPOINT/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Who are you?"
}
],
"stop_token_ids": [128009, 128001]
}'
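The same request can also be sent from Python with the OpenAI client. Below is a minimal sketch, assuming the endpoint obtained above (substitute it for `<endpoint>`) and a placeholder API key:
.. code-block:: python

    from openai import OpenAI

    # Endpoint returned by `sky serve status --endpoint 8081 vllm`.
    client = OpenAI(base_url="http://<endpoint>/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who are you?"},
        ],
        # Same stop token ids as in the curl example above.
        extra_body={"stop_token_ids": [128009, 128001]},
    )
    print(response.choices[0].message.content)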
To enable autoscaling, you could specify additional configs in `services`:
To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
.. code-block:: yaml
services:
replica_policy:
min_replicas: 0
max_replicas: 3
target_qps_per_replica: 2
service:
replica_policy:
min_replicas: 2
max_replicas: 4
target_qps_per_replica: 2
This will scale the service up (to at most `max_replicas`) when the QPS exceeds 2 for each replica, and scale it back down (to at least `min_replicas`) when the load drops.
.. raw:: html
<details>
<summary>Click to see the full recipe YAML</summary>
.. code-block:: yaml
service:
replica_policy:
min_replicas: 2
max_replicas: 4
target_qps_per_replica: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_tokens: 1
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
pip install vllm==0.4.0.post1
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
.. raw:: html
</details>
To update the service with the new config:
.. code-block:: console
HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
To stop the service:
.. code-block:: console
sky serve down vllm
**Optional**: Connect a GUI to the endpoint
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -253,58 +314,53 @@ It is also possible to access the Llama-3 service with a separate GUI frontend,
.. raw:: html
<details>
<summary>Click to see the full GUI YAML</summary>
.. code-block:: yaml
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
resources:
cpus: 2
setup: |
conda activate vllm
if [ $? -ne 0 ]; then
conda create -n vllm python=3.10 -y
conda activate vllm
fi
setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
# Install Gradio for web UI.
pip install gradio openai
run: |
conda activate vllm
export PATH=$PATH:/sbin
WORKER_IP=$(hostname -I | cut -d' ' -f1)
CONTROLLER_PORT=21001
WORKER_PORT=21002
run: |
conda activate vllm
export PATH=$PATH:/sbin
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://$ENDPOINT/v1 \
--stop-token-ids 128009,128001 | tee ~/gradio.log
.. raw:: html
</details>
1. Start the chat web UI:
.. code-block:: console
sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
2. Then, we can access the GUI at the returned gradio link:
.. code-block:: console
| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
View File
@ -16,7 +16,7 @@
#### Run on H100 system for speed if FP8; number of GPUs depends on the model size
#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache:
`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1`
`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1`
Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference)
```
View File
@ -0,0 +1,55 @@
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
# 2.0
# The official repo doesn't work yet, so we need to use a fork for now
# For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
# MODEL_NAME = "HwwwH/MiniCPM-V-2"
# 2.5
MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"
image = ImageAsset("stop_sign").pil_image.convert("RGB")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
llm = LLM(model=MODEL_NAME,
gpu_memory_utilization=1,
trust_remote_code=True,
max_model_len=4096)
messages = [{
'role':
'user',
'content':
'(<image>./</image>)\n' + "What's the content of the image?"
}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# 2.0
# stop_token_ids = [tokenizer.eos_id]
# 2.5
stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
sampling_params = SamplingParams(
stop_token_ids=stop_token_ids,
# temperature=0.7,
# top_p=0.8,
# top_k=100,
# seed=3472,
max_tokens=1024,
# min_tokens=150,
temperature=0,
use_beam_search=True,
# length_penalty=1.2,
best_of=3)
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {
"image": image
}
},
sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
View File
@ -13,11 +13,14 @@ client = OpenAI(
models = client.models.list()
model = models.data[0].id
responses = client.embeddings.create(input=[
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
],
model=model)
responses = client.embeddings.create(
input=[
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
],
model=model,
encoding_format="float",
)
for data in responses.data:
print(data.embedding) # list of float of len 4096
View File
@ -6,7 +6,7 @@ numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters.
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
View File
@ -2,6 +2,6 @@
-r requirements-common.txt
# Dependencies for x86_64 CPUs
torch == 2.3.1+cpu; platform_machine != "ppc64le"
torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
torch == 2.4.0; platform_machine != "ppc64le"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
View File
@ -11,7 +11,7 @@ import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoTokenizer, BatchEncoding)
AutoTokenizer, BatchEncoding, BatchFeature)
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
@ -133,7 +133,7 @@ def image_assets() -> _ImageAssets:
return IMAGE_ASSETS
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding)
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
class HfRunner:
@ -339,7 +339,6 @@ class HfRunner:
processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs)
input_ids = inputs.input_ids
output = self.model.generate(
**self.wrap_device(inputs),
@ -381,7 +380,7 @@ class HfRunner:
all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0]
output_len = seq_ids.shape[0] - input_ids.shape[1]
output_len = len(seq_logprobs_lst)
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
@ -514,10 +513,12 @@ class VllmRunner:
max_tokens: int,
num_logprobs: int,
images: Optional[List[Image.Image]] = None,
stop_token_ids: Optional[List[int]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens,
logprobs=num_logprobs)
logprobs=num_logprobs,
stop_token_ids=stop_token_ids)
outputs = self.generate_w_logprobs(prompts,
greedy_logprobs_params,
images=images)
View File
@ -1,3 +1,10 @@
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 nodes with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import os
import pytest
@ -61,3 +68,27 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
tp_args.append("--enforce-eager")
compare_two_settings(MODEL_NAME, pp_args, tp_args)
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
"FLASH_ATTN",
"FLASHINFER",
])
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--distributed-executor-backend",
"mp",
]
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
eager_args = cudagraph_args + ["--enforce-eager"]
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
View File
@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options={"include_usage": True}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
stream_options={"include_usage": True})
# Test stream=True, stream_options={"include_usage": True,
# "continuous_usage_stats": False}}
stream = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
stream_options={
"include_usage":
True,
"continuous_usage_stats":
False
})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream=False,
stream_options={"include_usage": True})
# Test stream=True, stream_options={"include_usage": True,
# "continuous_usage_stats": True}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
stream_options={
"include_usage": True,
"continuous_usage_stats": True
},
)
async for chunk in stream:
assert chunk.usage.prompt_tokens >= 0
assert chunk.usage.completion_tokens >= 0
assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
chunk.usage.completion_tokens)
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# (i.e. using the same ordering as in the Completions API tests), the test
View File
@ -55,8 +55,9 @@ def zephyr_pa_files():
@pytest.fixture(scope="module")
def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files):
args = [
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
zephyr_pa_files):
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
@ -85,7 +86,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files):
"128",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@pytest.fixture(scope="module")
def server(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
View File
@ -18,7 +18,6 @@ def embedding_server():
"--enforce-eager",
"--max-model-len",
"8192",
"--enforce-eager",
]
with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
View File
@ -0,0 +1,83 @@
# Separate these tests out from test_completion and test_chat, because they
# require launching a second server with a different flag. Running both servers
# at the same time on a single node will OOM.
import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
from .test_completion import default_server_args # noqa: F401
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
from .test_completion import zephyr_pa_files # noqa: F401
from .test_completion import MODEL_NAME
@pytest.fixture(scope="module")
def server_with_return_tokens_as_token_ids_flag(
default_server_args): # noqa: F811
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield remote_server
@pytest.mark.asyncio
async def test_completion_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag):
client = server_with_return_tokens_as_token_ids_flag.get_async_client()
completion = await client.completions.create(
model=MODEL_NAME,
# Include Unicode characters to test for dividing a single
# character across multiple tokens: 🎉 is [28705, 31862] for the
# Zephyr tokenizer
prompt="Say 'Hello, world! 🎉'",
echo=True,
temperature=0,
max_tokens=10,
logprobs=1)
text = completion.choices[0].text
token_strs = completion.choices[0].logprobs.tokens
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# Check that the token representations are consistent between raw tokens
# and top_logprobs
# Slice off the first one, because there's no scoring associated with BOS
top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
top_logprob_keys = [
next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
]
assert token_strs[1:] == top_logprob_keys
# Check that decoding the tokens gives the expected text
tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
assert text == tokenizer.decode(tokens, skip_special_tokens=True)
@pytest.mark.asyncio
async def test_chat_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag):
client = server_with_return_tokens_as_token_ids_flag.get_async_client()
response = await client.chat.completions.create(
model=MODEL_NAME,
# Include Unicode characters to test for dividing a single
# character across multiple tokens: 🎉 is [28705, 31862] for the
# Zephyr tokenizer
messages=[{
"role": "system",
"content": "You like to respond in only emojis, like 🎉"
}, {
"role": "user",
"content": "Please write some emojis: 🐱🐶🎉"
}],
temperature=0,
max_tokens=8,
logprobs=True)
text = response.choices[0].message.content
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
token_ids = []
for logprob_content in response.choices[0].logprobs.content:
token_ids.append(int(logprob_content.token.removeprefix("token_id:")))
assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
View File
@ -215,8 +215,6 @@ def test_reshape_and_cache_flash(
device: str,
kv_cache_dtype: str,
) -> None:
if kv_cache_dtype == "fp8":
pytest.skip()
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
@ -248,15 +246,33 @@ def test_reshape_and_cache_flash(
dtype,
device=device,
)
key_cache, value_cache = key_caches[0], value_caches[0]
key_cache, value_cache = key_caches[0].contiguous(
), value_caches[0].contiguous()
del key_caches
del value_caches
# Clone the KV caches.
cloned_key_cache = key_cache.clone()
cloned_value_cache = value_cache.clone()
if kv_cache_dtype == "fp8":
cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
ops.convert_fp8(cloned_key_cache, key_cache)
cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
ops.convert_fp8(cloned_value_cache, value_cache)
else:
cloned_key_cache = key_cache.clone()
cloned_value_cache = value_cache.clone()
# Using default kv_scale
k_scale = v_scale = 1.0
# Call the reshape_and_cache kernel.
ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype)
slot_mapping, kv_cache_dtype, k_scale, v_scale)
if kv_cache_dtype == "fp8":
result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
ops.convert_fp8(result_key_cache, key_cache)
result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
ops.convert_fp8(result_value_cache, value_cache)
# Run the reference implementation.
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
@ -269,8 +285,18 @@ def test_reshape_and_cache_flash(
cloned_key_cache[block_idx, block_offset, :, :] = key[i]
cloned_value_cache[block_idx, block_offset, :, :] = value[i]
assert torch.allclose(key_cache, cloned_key_cache)
assert torch.allclose(value_cache, cloned_value_cache)
if kv_cache_dtype == "fp8":
assert torch.allclose(result_key_cache,
cloned_key_cache,
atol=0.001,
rtol=0.1)
assert torch.allclose(result_value_cache,
cloned_value_cache,
atol=0.001,
rtol=0.1)
else:
assert torch.allclose(key_cache, cloned_key_cache)
assert torch.allclose(value_cache, cloned_value_cache)
@pytest.mark.parametrize("direction", COPYING_DIRECTION)
View File
@ -1,3 +1,4 @@
import time
from typing import List
import pytest
@ -10,6 +11,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams
from ..conftest import cleanup
MODELS = [
"facebook/opt-125m",
]
@ -219,6 +222,94 @@ def test_metric_spec_decode(
"does not meet expectation")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
log_interval: int,
) -> None:
k = 5
engine_args = EngineArgs(model=model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
use_v2_block_manager=True,
enforce_eager=True)
engine = LLMEngine.from_engine_args(engine_args)
try:
engine.add_request(
"request-id-0",
example_prompts[0],
SamplingParams(max_tokens=max_tokens),
)
# set the log interval
stat_logger = engine.stat_loggers['prometheus']
stat_logger.local_interval = log_interval
# prefill
engine.step()
# wait for 5 seconds to ensure that spec decode metrics
# get triggered in first decode step
time.sleep(5)
# first decode step should trigger async collection of metrics
engine.step()
# wait one second to allow H2D transfer to finish
time.sleep(1)
# second decode step should now be able to collect the spec
# decode stats and the request should also be finished
engine.step()
# must have finished now
assert not engine.has_unfinished_requests()
# wait to ensure logging occurs
time.sleep(log_interval)
# force logging
engine.step()
# Note that the purpose of this test is to verify spec decode
# metrics instead of functional correctness, so the expected values
# are intended to be loose.
metric_name_to_expected_fn = {
"gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
"gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
"counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
"counter_spec_decode_num_draft_tokens": lambda v: v == k,
"counter_spec_decode_num_emitted_tokens":
lambda v: 0 <= v <= k + 1,
}
for metric_name, is_expected in metric_name_to_expected_fn.items():
metric_val = getattr(
stat_logger.metrics,
metric_name).labels(**stat_logger.labels)._value.get()
assert is_expected(metric_val), (
f"the value of metric {metric_name} ({metric_val}) "
"does not meet expectation")
finally:
del engine
cleanup()
def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
num_requests: int) -> None:
if disable_log_stats:
View File
@ -0,0 +1,163 @@
from collections import UserDict
from typing import List, Optional, Tuple, Type
import pytest
import torch
import torch.types
from transformers import BatchFeature
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close
pytestmark = pytest.mark.vlm
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
"(<image>./</image>)\nWhat's the content of the image?<|eot_id|>" \
"<|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
"cherry_blossom":
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \
"(<image>./</image>)\nWhat is the season?<|eot_id|>" \
"<|start_header_id|>assistant<|end_header_id|>\n\n"
})
models = ["openbmb/MiniCPM-Llama3-V-2_5"]
def trunc_hf_output(hf_output: Tuple[List[int], str,
Optional[SampleLogprobs]]):
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<|eot_id|>"):
output_str = output_str.split("<|eot_id|>")[0]
return output_ids, output_str, out_logprobs
target_dtype = "half"
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test are under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
max_model_len=4096,
max_num_seqs=1,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=vllm_images,
stop_token_ids=stop_token_ids)
for prompts, vllm_images in inputs_per_image
]
with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
class NestedInputs(UserDict):
def __init__(self, model_inputs: BatchFeature):
super().__init__({"model_inputs": model_inputs})
self.model_inputs = model_inputs
def to(self, device: torch.types.Device):
return NestedInputs(self.model_inputs.to(device))
hf_processor = hf_model.processor
hf_model.processor = lambda **kw: NestedInputs(
hf_processor(**kw) # type: ignore
)
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=hf_images,
tokenizer=tokenizer)
for prompts, hf_images in inputs_per_image
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image):
check_logprobs_close(
outputs_0_lst=[
trunc_hf_output(hf_output) for hf_output in hf_outputs
],
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
run_test(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
View File
@ -8,15 +8,20 @@ import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import SamplingParams
models_to_test = [
('huggyllama/llama-7b', 'quantize model inflight'),
('lllyasviel/omost-llama-3-8b-4bits', 'read pre-quantized model'),
]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
def test_load_bnb_model(vllm_runner) -> None:
with vllm_runner('huggyllama/llama-7b',
@pytest.mark.parametrize("model_name, description", models_to_test)
def test_load_bnb_model(vllm_runner, model_name, description) -> None:
with vllm_runner(model_name,
quantization='bitsandbytes',
load_format='bitsandbytes',
enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
# check the weights in MLP & SelfAttention are quantized to torch.uint8
@ -65,12 +70,17 @@ def test_load_bnb_model(vllm_runner) -> None:
'To be or not to be, that is the question.'
]
outputs = llm.generate(prompts, sampling_params=sampling_params)
assert len(outputs) == len(prompts)
for index in range(len(outputs)):
# compare the first line of the output
actual_output = outputs[index][1][0].split('\n', 1)[0]
expected_output = expected_outputs[index].split('\n', 1)[0]
assert len(actual_output) >= len(expected_output), (
f'Actual {actual_output} should be larger than or equal to '
f'expected {expected_output}')
actual_output = actual_output[:len(expected_output)]
assert actual_output == expected_output, (
f'Expected: {expected_output}, but got: {actual_output}')
View File
@ -13,6 +13,7 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
MODELS = [
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
]
@ -59,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
def test_load_fp16_model(vllm_runner) -> None:
with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None:
with vllm_runner("facebook/opt-125m",
quantization="fp8",
kv_cache_dtype=kv_cache_dtype) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
fc1 = model.model.decoder.layers[0].fc1
assert isinstance(fc1.quant_method, Fp8LinearMethod)
if kv_cache_dtype == "fp8":
attn = model.model.decoder.layers[0].self_attn.attn
assert isinstance(attn.quant_method, Fp8KVCacheMethod)
assert attn._k_scale == 1.0
assert attn._v_scale == 1.0
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
View File
@ -191,7 +191,8 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
and llm.llm_engine.log_stats):
for sate_logger in llm.llm_engine.stat_loggers.values():
sate_logger.local_interval = 0
set_random_seed(seed)
if seed is not None:
set_random_seed(seed)
yield llm
del llm
View File
@ -24,14 +24,14 @@ import pytest
from .conftest import run_greedy_equality_correctness_test
# main model
MAIN_MODEL = "ibm-granite/granite-3b-code-instruct"
MAIN_MODEL = "JackFram/llama-160m"
# speculative model
SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator"
SPEC_MODEL = "ibm-fms/llama-160m-accelerator"
# max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model.
MAX_SPEC_TOKENS = 5
MAX_SPEC_TOKENS = 3
# precision
PRECISION = "float32"
View File
@ -21,7 +21,8 @@ from .conftest import run_equality_correctness_test
"num_speculative_tokens": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
@pytest.mark.parametrize("batch_size", [1, 8, 32])
@pytest.mark.parametrize("temperature", [0.1, 1.0])
@pytest.mark.parametrize(
@ -30,15 +31,26 @@ from .conftest import run_equality_correctness_test
# Use smaller output len for fast test.
10,
])
@pytest.mark.parametrize("seed", [1])
def test_seeded_consistency(baseline_llm_generator, batch_size: int,
temperature: float, output_len: int):
@pytest.mark.parametrize("seed", [None])
def test_seeded_consistency(baseline_llm_generator, test_llm_generator,
batch_size: int, temperature: float,
output_len: int):
"""Verify outputs are consistent across multiple runs with same seed
"""
run_equality_correctness_test(baseline_llm_generator,
baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=output_len,
temperature=temperature,
seeded=True,
force_output_len=True)
# Ensure this same test does fail if we _don't_ include per-request seeds
with pytest.raises(AssertionError):
run_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=output_len,
temperature=temperature,
seeded=False,
force_output_len=True)
View File
@ -104,8 +104,10 @@ def test_rope_customization():
dtype="float16",
seed=0,
)
assert getattr(longchat_model_config.hf_config, "rope_scaling",
None) == LONGCHAT_ROPE_SCALING
# Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
assert all(
longchat_model_config.hf_config.rope_scaling.get(key) == value
for key, value in LONGCHAT_ROPE_SCALING.items())
assert longchat_model_config.max_model_len == 16384
longchat_model_config = ModelConfig(
View File
@ -193,6 +193,7 @@ def test_prepare_decode_cuda_graph(batch_size):
for _ in range(expected_bs - len(seq_lens)):
seq_lens.append(1)
assert attn_metadata.seq_lens == seq_lens
assert attn_metadata.num_decode_tokens == len(seq_lens)
start_idx = 0
start_loc = [start_idx]
for _ in context_lens:
View File
@ -426,10 +426,13 @@ def reshape_and_cache_flash(
value_cache: torch.Tensor,
slot_mapping: torch.Tensor,
kv_cache_dtype: str,
k_scale: float,
v_scale: float,
) -> None:
torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
value_cache, slot_mapping,
kv_cache_dtype)
kv_cache_dtype, k_scale,
v_scale)
def copy_blocks(key_caches: List[torch.Tensor],
View File
@ -272,7 +272,15 @@ class FlashAttentionMetadataBuilder(
def build(self, seq_lens: List[int], query_lens: List[int],
cuda_graph_pad_size: int, batch_size: int):
"""Build attention metadata with on-device tensors."""
"""Build attention metadata with on-device tensors.
Args:
seq_lens: The maybe padded sequence lengths of the input sequences.
query_lens: The query lengths of the input sequences.
cuda_graph_pad_size: The padding size for cuda graph.
-1 if cuda graph is not used.
batch_size: The maybe padded batch size.
"""
for inter_data in self.input_builder.inter_data_list:
self._add_seq_group(inter_data,
self.input_builder.chunked_prefill_enabled)
@ -297,7 +305,7 @@ class FlashAttentionMetadataBuilder(
if use_captured_graph:
self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
self.block_tables.extend([] * cuda_graph_pad_size)
num_decode_tokens = batch_size + cuda_graph_pad_size
num_decode_tokens = batch_size
# The shape of graph_block_tables is
# [max batch size, max context len // block size].
@ -478,6 +486,8 @@ class FlashAttentionImpl(AttentionImpl):
value_cache,
attn_metadata.slot_mapping.flatten(),
self.kv_cache_dtype,
k_scale,
v_scale,
)
num_prefill_tokens = attn_metadata.num_prefill_tokens
View File
@ -297,26 +297,38 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
if is_profile_run:
return
# Get the number of valid blocks based on sequence length.
# If seq_len = 16, block_size = 16,
# block_table_bound is 1 with 1 valid block.
# If seq_len = 15, block_size = 16,
# block_table_bound is 0 + 1 with 1 valid block.
block_table_bound = seq_len // self.block_size + 1 \
if seq_len % self.block_size != 0 \
else seq_len // self.block_size
block_table = block_tables[seq_id]
self.paged_kv_indices.extend(block_table[:block_table_bound])
self.paged_kv_indptr.append(self.paged_kv_indptr[-1] +
block_table_bound)
self._update_paged_kv_tensors(block_table, seq_len)
last_page_len = seq_len % self.block_size
if last_page_len == 0:
last_page_len = self.block_size
self.paged_kv_last_page_len.append(last_page_len)
def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int):
# Get the number of valid blocks based on sequence length.
# If seq_len = 16, block_size = 16,
# block_table_bound is 1 with 1 valid block.
# If seq_len = 15, block_size = 16,
# block_table_bound is 0 + 1 with 1 valid block.
block_table_bound = seq_len // self.block_size + 1 \
if seq_len % self.block_size != 0 \
else seq_len // self.block_size
self.paged_kv_indices.extend(block_table[:block_table_bound])
self.paged_kv_indptr.append(self.paged_kv_indptr[-1] +
block_table_bound)
last_page_len = seq_len % self.block_size
if last_page_len == 0:
last_page_len = self.block_size
self.paged_kv_last_page_len.append(last_page_len)
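The block accounting in the comments above can be sketched in isolation. The helper below is only an illustration of the arithmetic, not part of the builder:
.. code-block:: python

    def paged_kv_layout(seq_len: int, block_size: int):
        # Number of blocks holding at least one valid token (ceil division).
        block_table_bound = (seq_len + block_size - 1) // block_size
        # Tokens stored in the last, possibly partially filled, block.
        last_page_len = seq_len % block_size or block_size
        return block_table_bound, last_page_len

    print(paged_kv_layout(16, 16))  # (1, 16): seq_len fills the block exactly
    print(paged_kv_layout(15, 16))  # (1, 15): one partially filled block
    print(paged_kv_layout(17, 16))  # (2, 1): one full block plus one extra token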
def build(self, seq_lens: List[int], query_lens: List[int],
cuda_graph_pad_size: int, batch_size: int):
"""Build attention metadata with on-device tensors.
Args:
seq_lens: The maybe padded sequence lengths of the input sequences.
query_lens: The query lengths of the input sequences.
cuda_graph_pad_size: The padding size for cuda graph.
-1 if cuda graph is not used.
batch_size: The maybe padded batch size.
"""
for inter_data in self.input_builder.inter_data_list:
self._add_seq_group(inter_data,
self.input_builder.chunked_prefill_enabled)
@ -331,7 +343,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
if use_captured_graph:
self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
self.block_tables.extend([] * cuda_graph_pad_size)
num_decode_tokens = batch_size + cuda_graph_pad_size
num_decode_tokens = batch_size
# The shape of graph_block_tables is
# [max batch size, max context len // block size].
@ -489,6 +501,8 @@ class FlashInferImpl(AttentionImpl):
kv_cache[:, 1],
attn_metadata.slot_mapping.flatten(),
self.kv_cache_dtype,
k_scale,
v_scale,
)
query = query.contiguous(
View File
@ -149,6 +149,15 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):
def build(self, seq_lens: List[int], query_lens: List[int],
cuda_graph_pad_size: int, batch_size: int):
"""Build attention metadata with on-device tensors.
Args:
seq_lens: The maybe padded sequence lengths of the input sequences.
query_lens: The query lengths of the input sequences.
cuda_graph_pad_size: The padding size for cuda graph.
-1 if cuda graph is not used.
batch_size: The maybe padded batch size.
"""
for inter_data in self.input_builder.inter_data_list:
self._add_seq_group(inter_data,
self.input_builder.chunked_prefill_enabled)
@ -173,7 +182,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):
if use_captured_graph:
self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size)
self.block_tables.extend([] * cuda_graph_pad_size)
num_decode_tokens = batch_size + cuda_graph_pad_size
num_decode_tokens = batch_size
# The shape of graph_block_tables is
# [max batch size, max context len // block size].
View File
@ -31,6 +31,7 @@ _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
_PP_SUPPORTED_MODELS = [
"AquilaModel",
"AquilaForCausalLM",
"DeepseekV2ForCausalLM",
"InternLMForCausalLM",
"LlamaForCausalLM",
"LLaMAForCausalLM",
@ -281,6 +282,10 @@ class ModelConfig:
raise ValueError(
"BitAndBytes quantization with TP or PP is not supported yet.")
if self.quantization == "bitsandbytes" and self.enforce_eager is False:
raise ValueError(
"BitAndBytes with enforce_eager = False is not supported yet.")
def get_hf_config_sliding_window(self) -> Optional[int]:
"""Get the sliding window size, or None if disabled."""
@ -590,9 +595,11 @@ class LoadConfig:
mainly for profiling.
"tensorizer" will use CoreWeave's tensorizer library for
fast weight loading.
"bitsandbytes" will load nf4 type weights.
ignore_patterns: The list of patterns to ignore when loading the model.
Default to "original/**/*" to avoid repeated loading of llama's
checkpoints.
"""
load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
View File
@ -1,5 +1,5 @@
from pathlib import Path
from typing import Mapping, Optional
from typing import Mapping, MutableMapping, Optional
from urllib.parse import urlparse
import aiohttp
@ -40,7 +40,7 @@ class HTTPConnection:
raise ValueError("Invalid HTTP URL: A valid HTTP URL "
"must have scheme 'http' or 'https'.")
def _headers(self, **extras: str) -> Mapping[str, str]:
def _headers(self, **extras: str) -> MutableMapping[str, str]:
return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras}
def get_response(
View File
@ -9,7 +9,7 @@ from unittest.mock import patch
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
from zmq import PUB, REP, REQ, SUB, SUBSCRIBE, Context # type: ignore
from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore
import vllm.envs as envs
from vllm.logger import init_logger
@ -153,9 +153,7 @@ class Handle:
buffer: Optional[ShmRingBuffer] = None
local_subscribe_port: Optional[int] = None
local_sync_port: Optional[int] = None
remote_subscribe_port: Optional[int] = None
remote_sync_port: Optional[int] = None
class MessageQueue:
@ -189,38 +187,36 @@ class MessageQueue:
self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes,
max_chunks)
self.local_socket = context.socket(PUB)
# XPUB is very similar to PUB,
# except that it can receive subscription messages
# to confirm the number of subscribers
self.local_socket = context.socket(XPUB)
# set the verbose option so that we can receive every subscription
# message. otherwise, we will only receive the first subscription
# see http://api.zeromq.org/3-3:zmq-setsockopt for more details
self.local_socket.setsockopt(XPUB_VERBOSE, True)
local_subscribe_port = get_open_port()
self.local_socket.bind(f"tcp://*:{local_subscribe_port}")
self.local_sync_socket = context.socket(REP)
local_sync_port = get_open_port()
self.local_sync_socket.bind(f"tcp://*:{local_sync_port}")
self.current_idx = 0
else:
self.buffer = None # type: ignore
local_subscribe_port = None
local_sync_port = None
self.local_socket = None
self.local_sync_socket = None
self.current_idx = -1
if n_remote_reader > 0:
# for remote readers, we will:
# create a publish-subscribe socket to communicate large data
self.remote_socket = context.socket(PUB)
self.remote_socket = context.socket(XPUB)
self.remote_socket.setsockopt(XPUB_VERBOSE, True)
remote_subscribe_port = get_open_port()
self.remote_socket.bind(f"tcp://*:{remote_subscribe_port}")
self.remote_sync_socket = context.socket(REP)
remote_sync_port = get_open_port()
self.remote_sync_socket.bind(f"tcp://*:{remote_sync_port}")
else:
remote_subscribe_port = None
remote_sync_port = None
self.remote_socket = None
self.remote_sync_socket = None
self._is_writer = True
self._is_local_reader = False
@ -233,9 +229,7 @@ class MessageQueue:
local_reader_ranks=local_reader_ranks,
buffer=self.buffer,
local_subscribe_port=local_subscribe_port,
local_sync_port=local_sync_port,
remote_subscribe_port=remote_subscribe_port,
remote_sync_port=remote_sync_port,
)
logger.info("vLLM message queue communication handle: %s", self.handle)
@ -264,12 +258,7 @@ class MessageQueue:
self.local_socket.connect(
f"tcp://{handle.connect_ip}:{handle.local_subscribe_port}")
self.local_sync_socket = context.socket(REQ)
self.local_sync_socket.connect(
f"tcp://{handle.connect_ip}:{handle.local_sync_port}")
self.remote_socket = None
self.remote_sync_socket = None
else:
self.buffer = None # type: ignore
self.current_idx = -1
@ -278,17 +267,12 @@ class MessageQueue:
self._is_remote_reader = True
self.local_socket = None
self.local_sync_socket = None
self.remote_socket = context.socket(SUB)
self.remote_socket.setsockopt_string(SUBSCRIBE, "")
self.remote_socket.connect(
f"tcp://{handle.connect_ip}:{handle.remote_subscribe_port}")
self.remote_sync_socket = context.socket(REQ)
self.remote_sync_socket.connect(
f"tcp://{handle.connect_ip}:{handle.remote_sync_port}")
return self
def wait_until_ready(self):
@ -300,29 +284,27 @@ class MessageQueue:
# local readers
for i in range(self.n_local_reader):
recv = self.local_sync_socket.recv()
assert recv == b"READY"
self.local_sync_socket.send(b"READY")
# wait for subscription messages from all local readers
self.local_socket.recv()
if self.n_local_reader > 0:
# send a message to all local readers
# to make sure the publish channel is working
self.local_socket.send(b"READY")
# remote readers
for i in range(self.n_remote_reader):
recv = self.remote_sync_socket.recv()
assert recv == b"READY"
self.remote_sync_socket.send(b"READY")
# wait for subscription messages from all remote readers
self.remote_socket.recv()
if self.n_remote_reader > 0:
# send a message to all remote readers
# to make sure the publish channel is working
self.remote_socket.send(b"READY")
elif self._is_local_reader:
self.local_sync_socket.send(b"READY")
recv = self.local_sync_socket.recv()
assert recv == b"READY"
# wait for the writer to send a message
recv = self.local_socket.recv()
assert recv == b"READY"
elif self._is_remote_reader:
self.remote_sync_socket.send(b"READY")
recv = self.remote_sync_socket.recv()
assert recv == b"READY"
# wait for the writer to send a message
recv = self.remote_socket.recv()
assert recv == b"READY"
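For background, the XPUB-based handshake used here can be reproduced with plain pyzmq. The snippet below is a standalone sketch assuming a single local reader and an arbitrary port; it only illustrates why XPUB (rather than PUB) lets the writer wait for subscribers before publishing:
.. code-block:: python

    import threading
    import zmq

    def writer(port: int, n_readers: int):
        sock = zmq.Context.instance().socket(zmq.XPUB)
        # Deliver every subscription message, not just the first per topic.
        sock.setsockopt(zmq.XPUB_VERBOSE, True)
        sock.bind(f"tcp://*:{port}")
        for _ in range(n_readers):
            msg = sock.recv()               # b'\x01' + topic => a reader subscribed
            assert msg.startswith(b"\x01")
        sock.send(b"READY")                 # safe to publish: all readers are attached

    def reader(port: int):
        sock = zmq.Context.instance().socket(zmq.SUB)
        sock.setsockopt_string(zmq.SUBSCRIBE, "")
        sock.connect(f"tcp://localhost:{port}")
        assert sock.recv() == b"READY"      # wait for the writer's first message

    t = threading.Thread(target=writer, args=(5557, 1))
    t.start()
    reader(5557)
    t.join()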
View File
@ -0,0 +1,30 @@
import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.platforms import current_platform
if current_platform.is_tpu():
import torch_xla.core.xla_model as xm
from torch_xla._internal import pjrt
class TpuCommunicator:
def __init__(self, group: ProcessGroup):
if not current_platform.is_tpu():
self.disabled = True
return
self.disabled = False
local_rank = dist.get_rank(group)
world_size = dist.get_world_size(group)
pjrt.initialize_multiprocess(local_rank, world_size)
xm._init_world_size_ordinal()
def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
return xm.all_reduce(xm.REDUCE_SUM, x)
def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
assert dim == -1, "TPUs only support dim=-1 for all-gather."
return xm.all_gather(x, dim=dim)
View File
@ -133,6 +133,7 @@ class GroupCoordinator:
torch_distributed_backend: Union[str, Backend],
use_pynccl: bool,
use_custom_allreduce: bool,
use_tpu_communicator: bool,
use_message_queue_broadcaster: bool = False,
):
@ -164,6 +165,7 @@ class GroupCoordinator:
self.use_pynccl = use_pynccl
self.use_custom_allreduce = use_custom_allreduce
self.use_tpu_communicator = use_tpu_communicator
# lazy import to avoid documentation build error
from vllm.distributed.device_communicators.custom_all_reduce import (
@ -190,6 +192,12 @@ class GroupCoordinator:
else:
self.ca_comm = None
from vllm.distributed.device_communicators.tpu_communicator import (
TpuCommunicator)
self.tpu_communicator: Optional[TpuCommunicator]
if use_tpu_communicator and self.world_size > 1:
self.tpu_communicator = TpuCommunicator(group=self.cpu_group)
from vllm.distributed.device_communicators.shm_broadcast import (
MessageQueue)
self.mq_broadcaster: Optional[MessageQueue] = None
@ -243,6 +251,13 @@ class GroupCoordinator:
ca_comm = self.ca_comm
maybe_ca_context = nullcontext(
) if ca_comm is None else ca_comm.capture()
# ensure all initialization operations complete before attempting to
# capture the graph on another stream
curr_stream = torch.cuda.current_stream()
if curr_stream != stream:
stream.wait_stream(curr_stream)
with torch.cuda.stream(stream), maybe_ca_context:
# In graph mode, we have to be very careful about the collective
# operations. The current status is:
@ -282,6 +297,12 @@ class GroupCoordinator:
# Bypass the function if we are using only 1 GPU.
if self.world_size == 1:
return input_
# For TPUs, use TPU communicator.
tpu_comm = self.tpu_communicator
if tpu_comm is not None and not tpu_comm.disabled:
return tpu_comm.all_reduce(input_)
if ca_comm is not None:
out = ca_comm.custom_all_reduce(input_)
if out is not None:
@ -289,6 +310,9 @@ class GroupCoordinator:
pynccl_comm = self.pynccl_comm
if (pynccl_comm is not None and not pynccl_comm.disabled):
pynccl_comm.all_reduce(input_)
elif input_.is_cpu:
import intel_extension_for_pytorch as ipex
ipex.distributed.all_reduce(input_, group=self.device_group)
else:
torch.distributed.all_reduce(input_, group=self.device_group)
return input_
@ -300,6 +324,12 @@ class GroupCoordinator:
return input_
assert -input_.dim() <= dim < input_.dim(), (
f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
# For TPUs, use TPU communicator.
tpu_comm = self.tpu_communicator
if tpu_comm is not None and not tpu_comm.disabled:
return tpu_comm.all_gather(input_, dim)
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
@ -717,6 +747,7 @@ def init_world_group(ranks: List[int], local_rank: int,
torch_distributed_backend=backend,
use_pynccl=False,
use_custom_allreduce=False,
use_tpu_communicator=False,
)
@ -735,6 +766,7 @@ def init_model_parallel_group(
torch_distributed_backend=backend,
use_pynccl=True,
use_custom_allreduce=use_custom_allreduce,
use_tpu_communicator=True,
use_message_queue_broadcaster=use_message_queue_broadcaster,
)
View File
@ -676,8 +676,8 @@ class EngineArgs:
# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
if (self.quantization == "bitsandbytes" or
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
raise ValueError(
"BitsAndBytes quantization and QLoRA adapter only support "
f"'bitsandbytes' load format, but got {self.load_format}")
View File
@ -410,8 +410,6 @@ class AsyncLLMEngine:
from vllm.executor.tpu_executor import TPUExecutorAsync
executor_class = TPUExecutorAsync
elif engine_config.device_config.device_type == "cpu":
assert distributed_executor_backend is None, (
"Distributed execution is not supported with the CPU backend.")
from vllm.executor.cpu_executor import CPUExecutorAsync
executor_class = CPUExecutorAsync
elif engine_config.device_config.device_type == "openvino":
View File
@ -949,8 +949,9 @@ class LLMEngine:
model_output: Optional[List[SamplerOutput]] = None) -> None:
"""Forced log when no requests active."""
if self.log_stats:
stats = self._get_stats(scheduler_outputs, model_output)
for logger in self.stat_loggers.values():
logger.log(self._get_stats(scheduler_outputs, model_output))
logger.log(stats)
def _get_stats(
self,
View File
@ -355,6 +355,7 @@ class StatLoggerBase(ABC):
self.num_generation_tokens: List[int] = []
self.last_local_log = time.time()
self.local_interval = local_interval
self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
@abstractmethod
def info(self, type: str, obj: SupportsMetricsInfo) -> None:
@ -364,6 +365,12 @@ class StatLoggerBase(ABC):
def log(self, stats: Stats) -> None:
raise NotImplementedError
def maybe_update_spec_decode_metrics(self, stats: Stats):
"""Save spec decode metrics (since they are unlikely
to be emitted at same time as log interval)."""
if stats.spec_decode_metrics is not None:
self.spec_decode_metrics = stats.spec_decode_metrics
class LoggingStatLogger(StatLoggerBase):
"""LoggingStatLogger is used in LLMEngine to log to Stdout."""
@ -379,6 +386,9 @@ class LoggingStatLogger(StatLoggerBase):
self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
self.num_generation_tokens.append(stats.num_generation_tokens_iter)
# Update spec decode metrics
self.maybe_update_spec_decode_metrics(stats)
# Log locally every local_interval seconds.
if local_interval_elapsed(stats.now, self.last_local_log,
self.local_interval):
@ -408,15 +418,16 @@ class LoggingStatLogger(StatLoggerBase):
stats.cpu_cache_usage_sys * 100,
)
if self.spec_decode_metrics is not None:
logger.info(
self._format_spec_decode_metrics_str(
self.spec_decode_metrics))
# Reset tracked stats for next interval.
self.num_prompt_tokens = []
self.num_generation_tokens = []
self.last_local_log = stats.now
if stats.spec_decode_metrics is not None:
logger.info(
self._format_spec_decode_metrics_str(
stats.spec_decode_metrics))
self.spec_decode_metrics = None
def _format_spec_decode_metrics_str(
self, metrics: "SpecDecodeWorkerMetrics") -> str:
@ -533,6 +544,9 @@ class PrometheusStatLogger(StatLoggerBase):
self.num_prompt_tokens.append(stats.num_prompt_tokens_iter)
self.num_generation_tokens.append(stats.num_generation_tokens_iter)
# Update spec decode metrics
self.maybe_update_spec_decode_metrics(stats)
# Log locally every local_interval seconds.
if local_interval_elapsed(stats.now, self.last_local_log,
self.local_interval):
@ -550,26 +564,27 @@ class PrometheusStatLogger(StatLoggerBase):
prompt_throughput=prompt_throughput,
generation_throughput=generation_throughput)
if self.spec_decode_metrics is not None:
self._log_gauge(
self.metrics.gauge_spec_decode_draft_acceptance_rate,
self.spec_decode_metrics.draft_acceptance_rate)
self._log_gauge(self.metrics.gauge_spec_decode_efficiency,
self.spec_decode_metrics.system_efficiency)
self._log_counter(
self.metrics.counter_spec_decode_num_accepted_tokens,
self.spec_decode_metrics.accepted_tokens)
self._log_counter(
self.metrics.counter_spec_decode_num_draft_tokens,
self.spec_decode_metrics.draft_tokens)
self._log_counter(
self.metrics.counter_spec_decode_num_emitted_tokens,
self.spec_decode_metrics.emitted_tokens)
# Reset tracked stats for next interval.
self.num_prompt_tokens = []
self.num_generation_tokens = []
self.last_local_log = stats.now
if stats.spec_decode_metrics is not None:
self._log_gauge(
self.metrics.gauge_spec_decode_draft_acceptance_rate,
stats.spec_decode_metrics.draft_acceptance_rate)
self._log_gauge(self.metrics.gauge_spec_decode_efficiency,
stats.spec_decode_metrics.system_efficiency)
self._log_counter(
self.metrics.counter_spec_decode_num_accepted_tokens,
stats.spec_decode_metrics.accepted_tokens)
self._log_counter(
self.metrics.counter_spec_decode_num_draft_tokens,
stats.spec_decode_metrics.draft_tokens)
self._log_counter(
self.metrics.counter_spec_decode_num_emitted_tokens,
stats.spec_decode_metrics.emitted_tokens)
self.spec_decode_metrics = None
class RayPrometheusStatLogger(PrometheusStatLogger):
View File
@ -5,12 +5,12 @@ For production use, we recommend using our OpenAI compatible server.
We are also not going to accept PRs modifying this file, please
change `vllm/entrypoints/openai/api_server.py` instead.
"""
import asyncio
import json
import ssl
from typing import AsyncGenerator
from argparse import Namespace
from typing import Any, AsyncGenerator, Optional
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
@ -18,8 +18,10 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.server import serve_http
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser, random_uuid
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger("vllm.entrypoints.api_server")
@ -81,6 +83,50 @@ async def generate(request: Request) -> Response:
return JSONResponse(ret)
def build_app(args: Namespace) -> FastAPI:
global app
app.root_path = args.root_path
return app
async def init_app(
args: Namespace,
llm_engine: Optional[AsyncLLMEngine] = None,
) -> FastAPI:
app = build_app(args)
global engine
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = (llm_engine
if llm_engine is not None else AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.API_SERVER))
return app
async def run_server(args: Namespace,
llm_engine: Optional[AsyncLLMEngine] = None,
**uvicorn_kwargs: Any) -> None:
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
app = await init_app(args, llm_engine)
await serve_http(
app,
host=args.host,
port=args.port,
log_level=args.log_level,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs,
**uvicorn_kwargs,
)
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--host", type=str, default=None)
@ -105,25 +151,5 @@ if __name__ == "__main__":
parser.add_argument("--log-level", type=str, default="debug")
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.API_SERVER)
app.root_path = args.root_path
logger.info("Available routes are:")
for route in app.routes:
if not hasattr(route, 'methods'):
continue
methods = ', '.join(route.methods)
logger.info("Route: %s, Methods: %s", route.path, methods)
uvicorn.run(app,
host=args.host,
port=args.port,
log_level=args.log_level,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs)
asyncio.run(run_server(args))
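The refactor above replaces a blocking `uvicorn.run(...)` call with an awaitable server started from an already-running event loop (via `serve_http`). A minimal sketch of the same pattern with plain uvicorn, assuming a trivial FastAPI app; the app, route, host, and port here are placeholders, not vLLM's server:

```python
# Minimal sketch of running uvicorn from inside an asyncio event loop,
# mirroring the run_server()/serve_http() pattern above. The app and
# addresses are placeholders, not vLLM's actual server.
import asyncio

import uvicorn
from fastapi import FastAPI

app = FastAPI()


@app.get("/health")
async def health() -> dict:
    return {"status": "ok"}


async def run_server(host: str = "127.0.0.1", port: int = 8000) -> None:
    config = uvicorn.Config(app, host=host, port=port, log_level="info")
    server = uvicorn.Server(config)
    # server.serve() is awaitable, so it composes with other coroutines
    # (e.g. engine construction) instead of blocking like uvicorn.run().
    await server.serve()


if __name__ == "__main__":
    asyncio.run(run_server())
```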

View File

@ -100,14 +100,16 @@ def _image_token_str(model_config: ModelConfig,
if model_type == "phi3_v":
# Workaround since this token is not defined in the tokenizer
return "<|image_1|>"
if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv", "paligemma"):
if model_type == "minicpmv":
return "(<image>./</image>)"
if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"):
# These models do not use image tokens in the prompt
return None
if model_type.startswith("llava"):
return tokenizer.decode(model_config.hf_config.image_token_index)
if model_type == "chameleon":
return "<image>"
raise TypeError("Unknown model type: {model_type}")
raise TypeError(f"Unknown model type: {model_type}")
# TODO: Let user specify how to insert image tokens into prompt

View File

@ -2,13 +2,12 @@ import asyncio
import importlib
import inspect
import re
from argparse import Namespace
from contextlib import asynccontextmanager
from http import HTTPStatus
from typing import Optional, Set
from typing import Any, Optional, Set
import fastapi
import uvicorn
from fastapi import APIRouter, Request
from fastapi import APIRouter, FastAPI, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
@ -37,6 +36,7 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_tokenization import (
OpenAIServingTokenization)
from vllm.logger import init_logger
from vllm.server import serve_http
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser
from vllm.version import __version__ as VLLM_VERSION
@ -56,7 +56,7 @@ _running_tasks: Set[asyncio.Task] = set()
@asynccontextmanager
async def lifespan(app: fastapi.FastAPI):
async def lifespan(app: FastAPI):
async def _force_log():
while True:
@ -74,7 +74,7 @@ async def lifespan(app: fastapi.FastAPI):
router = APIRouter()
def mount_metrics(app: fastapi.FastAPI):
def mount_metrics(app: FastAPI):
# Add prometheus asgi middleware to route /metrics requests
metrics_route = Mount("/metrics", make_asgi_app())
# Workaround for 307 Redirect for /metrics
@ -164,8 +164,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump())
def build_app(args):
app = fastapi.FastAPI(lifespan=lifespan)
def build_app(args: Namespace) -> FastAPI:
app = FastAPI(lifespan=lifespan)
app.include_router(router)
app.root_path = args.root_path
@ -213,12 +213,10 @@ def build_app(args):
return app
def run_server(args, llm_engine=None):
async def init_app(args: Namespace,
llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI:
app = build_app(args)
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
if args.served_model_name is not None:
served_model_names = args.served_model_name
else:
@ -231,19 +229,7 @@ def run_server(args, llm_engine=None):
if llm_engine is not None else AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER))
event_loop: Optional[asyncio.AbstractEventLoop]
try:
event_loop = asyncio.get_running_loop()
except RuntimeError:
event_loop = None
if event_loop is not None and event_loop.is_running():
# If the current is instanced by Ray Serve,
# there is already a running event loop
model_config = event_loop.run_until_complete(engine.get_model_config())
else:
# When using single vLLM without engine_use_ray
model_config = asyncio.run(engine.get_model_config())
model_config = await engine.get_model_config()
if args.disable_log_requests:
request_logger = None
@ -264,6 +250,7 @@ def run_server(args, llm_engine=None):
prompt_adapters=args.prompt_adapters,
request_logger=request_logger,
chat_template=args.chat_template,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
)
openai_serving_completion = OpenAIServingCompletion(
engine,
@ -272,6 +259,7 @@ def run_server(args, llm_engine=None):
lora_modules=args.lora_modules,
prompt_adapters=args.prompt_adapters,
request_logger=request_logger,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
)
openai_serving_embedding = OpenAIServingEmbedding(
engine,
@ -289,22 +277,28 @@ def run_server(args, llm_engine=None):
)
app.root_path = args.root_path
logger.info("Available routes are:")
for route in app.routes:
if not hasattr(route, 'methods'):
continue
methods = ', '.join(route.methods)
logger.info("Route: %s, Methods: %s", route.path, methods)
return app
uvicorn.run(app,
host=args.host,
port=args.port,
log_level=args.uvicorn_log_level,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs)
async def run_server(args: Namespace,
llm_engine: Optional[AsyncLLMEngine] = None,
**uvicorn_kwargs: Any) -> None:
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
app = await init_app(args, llm_engine)
await serve_http(
app,
host=args.host,
port=args.port,
log_level=args.uvicorn_log_level,
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs,
**uvicorn_kwargs,
)
if __name__ == "__main__":
@ -314,4 +308,5 @@ if __name__ == "__main__":
description="vLLM OpenAI-Compatible RESTful API server.")
parser = make_arg_parser(parser)
args = parser.parse_args()
run_server(args)
asyncio.run(run_server(args))

View File

@ -128,6 +128,12 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"using @app.middleware('http'). "
"If a class is provided, vLLM will add it to the server "
"using app.add_middleware(). ")
parser.add_argument(
"--return-tokens-as-token-ids",
action="store_true",
help="When --max-logprobs is specified, represents single tokens as "
"strings of the form 'token_id:{token_id}' so that tokens that "
"are not JSON-encodable can be identified.")
parser = AsyncEngineArgs.add_cli_args(parser)
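The help text above describes rendering tokens as `token_id:{token_id}` strings. A small sketch of that rendering rule, assuming a toy decode function; the names below are illustrative, not the serving-layer helpers:

```python
# Sketch of the "token_id:{token_id}" rendering the help text above
# describes. fake_decode is a stand-in for a tokenizer, not vLLM code.
from typing import Callable

vocab = {17: "Hello", 42: "\udcfe"}  # 42 decodes to a lone surrogate


def fake_decode(token_id: int) -> str:
    return vocab.get(token_id, "<unk>")


def render_token(token_id: int,
                 decode: Callable[[int], str],
                 return_as_token_id: bool = False) -> str:
    # With the flag set, emit a stable JSON-safe identifier instead of the
    # decoded text, which may not survive JSON encoding.
    if return_as_token_id:
        return f"token_id:{token_id}"
    return decode(token_id)


print(render_token(17, fake_decode))                           # Hello
print(render_token(42, fake_decode, return_as_token_id=True))  # token_id:42
```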

View File

@ -50,13 +50,15 @@ class OpenAIServingChat(OpenAIServing):
prompt_adapters: Optional[List[PromptAdapterPath]],
request_logger: Optional[RequestLogger],
chat_template: Optional[str],
return_tokens_as_token_ids: bool = False,
):
super().__init__(engine=engine,
model_config=model_config,
served_model_names=served_model_names,
lora_modules=lora_modules,
prompt_adapters=prompt_adapters,
request_logger=request_logger)
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids)
self.response_role = response_role
@ -247,7 +249,15 @@ class OpenAIServingChat(OpenAIServing):
model=model_name)
if (request.stream_options
and request.stream_options.include_usage):
chunk.usage = None
if (request.stream_options.continuous_usage_stats):
prompt_tokens = len(res.prompt_token_ids)
usage = UsageInfo(prompt_tokens=prompt_tokens,
completion_tokens=0,
total_tokens=prompt_tokens)
chunk.usage = usage
else:
chunk.usage = None
data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"
@ -277,7 +287,18 @@ class OpenAIServingChat(OpenAIServing):
model=model_name)
if (request.stream_options and
request.stream_options.include_usage):
chunk.usage = None
if (request.stream_options.
continuous_usage_stats):
prompt_tokens = len(
res.prompt_token_ids)
usage = UsageInfo(
prompt_tokens=prompt_tokens,
completion_tokens=0,
total_tokens=prompt_tokens)
chunk.usage = usage
else:
chunk.usage = None
data = chunk.model_dump_json(
exclude_unset=True)
yield f"data: {data}\n\n"
@ -336,7 +357,19 @@ class OpenAIServingChat(OpenAIServing):
model=model_name)
if (request.stream_options
and request.stream_options.include_usage):
chunk.usage = None
if (request.stream_options.continuous_usage_stats):
prompt_tokens = len(res.prompt_token_ids)
completion_tokens = len(output.token_ids)
usage = UsageInfo(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens +
completion_tokens,
)
chunk.usage = usage
else:
chunk.usage = None
data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"
else:
@ -356,7 +389,18 @@ class OpenAIServingChat(OpenAIServing):
model=model_name)
if (request.stream_options
and request.stream_options.include_usage):
chunk.usage = None
if (request.stream_options.continuous_usage_stats):
prompt_tokens = len(res.prompt_token_ids)
completion_tokens = len(output.token_ids)
usage = UsageInfo(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens +
completion_tokens,
)
chunk.usage = usage
else:
chunk.usage = None
data = chunk.model_dump_json(exclude_unset=True)
yield f"data: {data}\n\n"
finish_reason_sent[i] = True
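The hunks above give each streamed chunk a `UsageInfo` when `stream_options.continuous_usage_stats` is set, and leave `chunk.usage` as None otherwise. A toy sketch of that per-chunk accounting, using a stand-in dataclass rather than the protocol models:

```python
# Toy sketch of the per-chunk usage accounting added above. UsageInfo here
# is a stand-in dataclass, not the protocol model used by the server.
from dataclasses import dataclass
from typing import Optional


@dataclass
class UsageInfo:
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


def chunk_usage(prompt_tokens: int, completion_tokens: int,
                continuous_usage_stats: bool) -> Optional[UsageInfo]:
    # With continuous stats, every chunk reports the tokens seen so far;
    # without it, usage is omitted (None) until the final usage chunk.
    if not continuous_usage_stats:
        return None
    return UsageInfo(prompt_tokens=prompt_tokens,
                     completion_tokens=completion_tokens,
                     total_tokens=prompt_tokens + completion_tokens)


print(chunk_usage(12, 0, continuous_usage_stats=True))   # first chunk
print(chunk_usage(12, 5, continuous_usage_stats=False))  # usage suppressed
```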
@ -480,11 +524,14 @@ class OpenAIServingChat(OpenAIServing):
self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int],
tokenizer: PreTrainedTokenizer) -> List[ChatCompletionLogProb]:
return [
ChatCompletionLogProb(
token=(token := self._get_decoded_token(p[1], p[0],
tokenizer)),
logprob=max(p[1].logprob, -9999.0),
bytes=list(token.encode("utf-8", errors="replace")))
ChatCompletionLogProb(token=(token := self._get_decoded_token(
p[1],
p[0],
tokenizer,
return_as_token_id=self.return_tokens_as_token_ids)),
logprob=max(p[1].logprob, -9999.0),
bytes=list(
token.encode("utf-8", errors="replace")))
for i, p in enumerate(logprobs.items())
if top_logprobs and i < top_logprobs
]
@ -504,6 +551,8 @@ class OpenAIServingChat(OpenAIServing):
step_top_logprobs = top_logprobs[i]
if step_top_logprobs is None:
token = tokenizer.decode(token_id)
if self.return_tokens_as_token_ids:
token = f"token_id:{token_id}"
logprobs_content.append(
ChatCompletionLogProbsContent(
token=token,
@ -511,7 +560,9 @@ class OpenAIServingChat(OpenAIServing):
else:
logprobs_content.append(
ChatCompletionLogProbsContent(
token=step_top_logprobs[token_id].decoded_token,
token=self._get_decoded_token(
step_top_logprobs[token_id], token_id, tokenizer,
self.return_tokens_as_token_ids),
logprob=max(step_top_logprobs[token_id].logprob,
-9999.0),
bytes=list(

View File

@ -51,13 +51,15 @@ class OpenAIServingCompletion(OpenAIServing):
lora_modules: Optional[List[LoRAModulePath]],
prompt_adapters: Optional[List[PromptAdapterPath]],
request_logger: Optional[RequestLogger],
return_tokens_as_token_ids: bool = False,
):
super().__init__(engine=engine,
model_config=model_config,
served_model_names=served_model_names,
lora_modules=lora_modules,
prompt_adapters=prompt_adapters,
request_logger=request_logger)
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids)
async def create_completion(self, request: CompletionRequest,
raw_request: Request):
@ -430,12 +432,17 @@ class OpenAIServingCompletion(OpenAIServing):
step_top_logprobs = top_logprobs[i]
if step_top_logprobs is None:
token = tokenizer.decode(token_id)
if self.return_tokens_as_token_ids:
token = f"token_id:{token_id}"
out_tokens.append(token)
out_token_logprobs.append(None)
out_top_logprobs.append(None)
else:
token = self._get_decoded_token(step_top_logprobs[token_id],
token_id, tokenizer)
token = self._get_decoded_token(
step_top_logprobs[token_id],
token_id,
tokenizer,
return_as_token_id=self.return_tokens_as_token_ids)
token_logprob = max(step_top_logprobs[token_id].logprob,
-9999.0)
out_tokens.append(token)
@ -448,7 +455,11 @@ class OpenAIServingCompletion(OpenAIServing):
out_top_logprobs.append({
# Convert float("-inf") to the
# JSON-serializable float that OpenAI uses
self._get_decoded_token(top_lp[1], top_lp[0], tokenizer):
self._get_decoded_token(
top_lp[1],
top_lp[0],
tokenizer,
return_as_token_id=self.return_tokens_as_token_ids):
max(top_lp[1].logprob, -9999.0)
for i, top_lp in enumerate(step_top_logprobs.items())
if num_output_top_logprobs >= i

View File

@ -68,6 +68,7 @@ class OpenAIServing:
lora_modules: Optional[List[LoRAModulePath]],
prompt_adapters: Optional[List[PromptAdapterPath]],
request_logger: Optional[RequestLogger],
return_tokens_as_token_ids: bool = False,
):
super().__init__()
@ -102,6 +103,7 @@ class OpenAIServing:
prompt_adapter_num_virtual_tokens=num_virtual_tokens))
self.request_logger = request_logger
self.return_tokens_as_token_ids = return_tokens_as_token_ids
async def show_available_models(self) -> ModelList:
"""Show available models. Right now we only have one model."""
@ -384,11 +386,13 @@ class OpenAIServing:
)
@staticmethod
def _get_decoded_token(
logprob: Logprob,
token_id: int,
tokenizer: AnyTokenizer,
) -> str:
def _get_decoded_token(logprob: Logprob,
token_id: int,
tokenizer: AnyTokenizer,
return_as_token_id: bool = False) -> str:
if return_as_token_id:
return f"token_id:{token_id}"
if logprob.decoded_token is not None:
return logprob.decoded_token
return tokenizer.decode(token_id)

View File

@ -29,6 +29,7 @@ if TYPE_CHECKING:
VLLM_TRACE_FUNCTION: int = 0
VLLM_ATTENTION_BACKEND: Optional[str] = None
VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_CPU_OMP_THREADS_BIND: str = ""
VLLM_OPENVINO_KVCACHE_SPACE: int = 0
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
@ -241,11 +242,16 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_ATTENTION_BACKEND":
lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
# CPU key-value cache space
# (CPU backend only) CPU key-value cache space.
# default is 4GB
"VLLM_CPU_KVCACHE_SPACE":
lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
# (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31",
# "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
"VLLM_CPU_OMP_THREADS_BIND":
lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),
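The comment above describes the `VLLM_CPU_OMP_THREADS_BIND` format: per-rank CPU core lists such as "0-31", "0,1,2", or "0-31,33", with ranks separated by '|'. An illustrative parser for that grammar (the special default value "all" is not handled here, and this helper is a sketch, not vLLM's implementation):

```python
# Illustrative parser for the VLLM_CPU_OMP_THREADS_BIND format described
# above: core ids per rank ("0-31", "0,1,2", "0-31,33"), ranks split by '|'.
from typing import List


def parse_cpu_bind(spec: str) -> List[List[int]]:
    ranks: List[List[int]] = []
    for rank_spec in spec.split("|"):
        cores: List[int] = []
        for part in rank_spec.split(","):
            if "-" in part:
                lo, hi = part.split("-")
                cores.extend(range(int(lo), int(hi) + 1))
            else:
                cores.append(int(part))
        ranks.append(cores)
    return ranks


print(parse_cpu_bind("0-3,8|4-7"))  # [[0, 1, 2, 3, 8], [4, 5, 6, 7]]
```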
# OpenVINO key-value cache space
# default is 4GB
"VLLM_OPENVINO_KVCACHE_SPACE":

View File

@ -1,16 +1,21 @@
from typing import List, Set, Tuple
import os
from functools import partial
from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
import torch
import vllm.envs as envs
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor)
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async)
from vllm.utils import (get_distributed_init_method, get_open_port,
get_vllm_instance_id, make_async)
from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
@ -22,46 +27,173 @@ class CPUExecutor(ExecutorBase):
def _init_executor(self) -> None:
assert self.device_config.device_type == "cpu"
assert self.lora_config is None, "cpu backend doesn't support LoRA"
#
# Environment variables for CPU executor
#
# Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
# Disable torch async compiling which won't work with daemonic processes
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
# Intel OpenMP setting
ld_preload_str = os.getenv("LD_PRELOAD", "")
if "libiomp5.so" in ld_preload_str:
# The time (in milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
os.environ['KMP_BLOCKTIME'] = "1"
# Prevents the CPU from entering a low-performance state
os.environ['KMP_TPAUSE'] = "0"
# Provides fine-grained parallelism
os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
# To hint IPEX uses shared memory based AllReduce
os.environ["LOCAL_WORLD_SIZE"] = str(
self.parallel_config.tensor_parallel_size)
self.model_config = _verify_and_get_model_config(self.model_config)
self.cache_config = _verify_and_get_cache_config(self.cache_config)
self.scheduler_config = _verify_and_get_scheduler_config(
self.scheduler_config)
# Instantiate the worker and load the model to CPU.
self._init_worker()
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# 127.0.0.1 for communication.
ip = "127.0.0.1"
port = get_open_port()
self.distributed_init_method = get_distributed_init_method(ip, port)
def _init_worker(self):
from vllm.worker.cpu_worker import CPUWorker
is_async = isinstance(self, CPUExecutorAsync)
assert self.parallel_config.world_size == 1, (
"CPUExecutor only supports single CPU socket currently.")
world_size = self.parallel_config.tensor_parallel_size
result_handler = ResultHandler()
self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
self.workers = []
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
self.driver_worker = CPUWorker(
if is_async:
self.workers = [
ProcessWorkerWrapper(
result_handler,
partial(
self._create_worker,
rank=rank,
local_rank=rank,
)) for rank in range(0, world_size)
]
self.driver_worker = self.workers[0]
self.workers = self.workers[1:]
self.driver_method_invoker = _async_driver_method_invoker
else:
self.driver_worker = self._create_worker()
self.driver_method_invoker = _driver_method_invoker
if world_size != 1:
self.workers = [
ProcessWorkerWrapper(
result_handler,
partial(
self._create_worker,
rank=rank,
local_rank=rank,
)) for rank in range(1, world_size)
]
if world_size != 1 or is_async:
if is_async:
async_worker_list = self.workers + [self.driver_worker]
else:
async_worker_list = self.workers
self.worker_monitor = WorkerMonitor(async_worker_list,
result_handler)
result_handler.start()
self.worker_monitor.start()
self._run_workers("init_device")
self._run_workers("load_model")
def _create_worker(
self,
local_rank: int = 0,
rank: int = 0,
):
worker_module_name = "vllm.worker.cpu_worker"
worker_class_name = "CPUWorker"
wrapper = WorkerWrapperBase(
worker_module_name=worker_module_name,
worker_class_name=worker_class_name,
)
assert self.distributed_init_method is not None
kwargs = dict(
model_config=self.model_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
device_config=self.device_config,
cache_config=self.cache_config,
load_config=self.load_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
local_rank=local_rank,
rank=rank,
distributed_init_method=self.distributed_init_method,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
kv_cache_dtype=self.cache_config.cache_dtype,
prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=True,
is_driver_worker=rank == 0,
)
self.driver_worker.init_device()
self.driver_worker.load_model()
wrapper.init_worker(**kwargs)
return wrapper.worker
def _run_workers(
self,
method: str,
*args,
async_run_remote_workers_only: bool = False,
max_concurrent_workers: Optional[int] = None,
**kwargs,
) -> Any:
"""Runs the given method on all workers.
Args:
async_run_remote_workers_only: If True the method will be run only
in the remote workers, not the driver worker. It will also be
run asynchronously and return a list of futures rather than
blocking on the results.
"""
if max_concurrent_workers:
raise NotImplementedError(
"max_concurrent_workers is not supported yet.")
# Start the workers first.
worker_outputs = [
worker.execute_method(method, *args, **kwargs)
for worker in self.workers
]
if async_run_remote_workers_only:
# Just return futures
return worker_outputs
driver_worker_output = self.driver_method_invoker(
self.driver_worker, method, *args, **kwargs)
# Get the results of the workers.
return [driver_worker_output
] + [output.get() for output in worker_outputs]
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
return self.driver_worker.determine_num_available_blocks()
return self.driver_method_invoker(self.driver_worker,
"determine_num_available_blocks")
def initialize_cache(self, num_gpu_blocks: int,
num_cpu_blocks: int) -> None:
@ -74,43 +206,95 @@ class CPUExecutor(ExecutorBase):
# referred as `gpu block`. Because we want to reuse the existing block
# management procedure.
logger.info("# CPU blocks: %d", num_gpu_blocks)
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
self._run_workers("initialize_cache",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks)
def execute_model(
self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
output = self.driver_worker.execute_model(execute_model_req)
if (self.parallel_config.tensor_parallel_size > 1
and self.parallel_worker_tasks is None):
self.parallel_worker_tasks = self._run_workers(
"start_worker_execution_loop",
async_run_remote_workers_only=True,
)
output = self.driver_method_invoker(self.driver_worker,
"execute_model", execute_model_req)
return output
def stop_remote_worker_execution_loop(self) -> None:
if self.parallel_worker_tasks is None:
return
"""
Passing None will cause the driver to stop the model execution
loop running in each of the remote workers.
"""
self.driver_method_invoker(self.driver_worker, "execute_model", None)
parallel_worker_tasks = self.parallel_worker_tasks
self.parallel_worker_tasks = None
# Ensure that workers exit model loop cleanly
# (this will raise otherwise)
self._wait_for_tasks_completion(parallel_worker_tasks)
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.driver_worker.add_lora(lora_request)
return all(self._run_workers("add_lora", lora_request))
def remove_lora(self, lora_id: int) -> bool:
return self.driver_worker.remove_lora(lora_id)
return all(self._run_workers("remove_lora", lora_id))
def pin_lora(self, lora_id: int) -> bool:
return self.driver_worker.pin_lora(lora_id)
assert lora_id > 0, "lora_id must be greater than 0."
return all(self._run_workers(
"pin_lora",
lora_id=lora_id,
))
def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras()
return self.driver_method_invoker(self.driver_worker, "list_loras")
def add_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
return all(
self._run_workers(
"add_prompt_adapter",
prompt_adapter_request,
))
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
return all(
self._run_workers(
"remove_prompt_adapter",
prompt_adapter_id,
))
def list_prompt_adapters(self) -> Set[int]:
return self.driver_worker.list_prompt_adapters()
return self.driver_method_invoker(self.driver_worker,
"list_prompt_adapters")
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
return all(self._run_workers(
"pin_prompt_adapter",
prompt_adapter_id,
))
def check_health(self) -> None:
# CPUExecutor will always be healthy as long as
# it's running.
return
"""Raises an error if engine is unhealthy."""
if self.worker_monitor is not None and not self.worker_monitor.is_alive(
):
raise RuntimeError("Worker processes are not running")
def shutdown(self):
if (worker_monitor := getattr(self, "worker_monitor",
None)) is not None:
worker_monitor.close()
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
"""Wait for futures returned from _run_workers() with
async_run_remote_workers_only to complete."""
for result in parallel_worker_tasks:
result.get()
class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
@ -118,14 +302,12 @@ class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
async def execute_model_async(
self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
output = await make_async(self.driver_worker.execute_model
output = await make_async(self.execute_model
)(execute_model_req=execute_model_req, )
return output
async def check_health_async(self) -> None:
# CPUExecutor will always be healthy as long as
# it's running.
return
self.check_health()
def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
@ -170,3 +352,11 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
f" {kv_cache_space}, expect a positive integer value.")
return config
def _driver_method_invoker(driver, method: str, *args, **kwargs):
return getattr(driver, method)(*args, **kwargs)
def _async_driver_method_invoker(driver, method: str, *args, **kwargs):
return driver.execute_method(method, *args, **kwargs).get()
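The executor above routes driver calls through `_driver_method_invoker` (direct attribute call) or `_async_driver_method_invoker` (message to a worker process) so `_run_workers` can treat both paths uniformly. A stripped-down sketch of that dispatch pattern, using stand-in classes rather than vLLM's worker wrappers:

```python
# Stripped-down sketch of the driver-method-invoker pattern used above:
# the same _run_workers-style loop works whether the driver is called
# in-process or through a process-worker handle. All classes here are
# stand-ins, not vLLM's worker wrappers.
from typing import Any, Callable, List


class InProcessDriver:
    def initialize_cache(self, num_blocks: int) -> str:
        return f"driver: cache of {num_blocks} blocks"


def driver_method_invoker(driver: Any, method: str, *args, **kwargs) -> Any:
    # Synchronous path: call the method directly on the driver object.
    return getattr(driver, method)(*args, **kwargs)


class FakeRemoteWorker:
    def execute_method(self, method: str, *args, **kwargs) -> Any:
        # Real workers return a future-like handle; .get() is elided here.
        return f"worker: {method}{args}"


def run_workers(driver, workers: List[FakeRemoteWorker], invoker: Callable,
                method: str, *args, **kwargs) -> List[Any]:
    outputs = [w.execute_method(method, *args, **kwargs) for w in workers]
    return [invoker(driver, method, *args, **kwargs)] + outputs


print(run_workers(InProcessDriver(), [FakeRemoteWorker()],
                  driver_method_invoker, "initialize_cache", 16))
```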

View File

@ -1,6 +1,7 @@
import asyncio
import os
import signal
import threading
import weakref
from functools import partial
from typing import Any, List, Optional
@ -115,8 +116,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
if executor := ref():
executor.shutdown()
signal.signal(signal.SIGINT, shutdown)
signal.signal(signal.SIGTERM, shutdown)
if threading.current_thread() is threading.main_thread():
signal.signal(signal.SIGINT, shutdown)
signal.signal(signal.SIGTERM, shutdown)
self.driver_worker = self._create_worker(
distributed_init_method=distributed_init_method)
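The change above registers SIGINT/SIGTERM handlers only in the main thread, because `signal.signal` raises a ValueError when called from any other thread. A minimal sketch of the guard, with a placeholder shutdown callback:

```python
# Minimal sketch of the main-thread guard added above: signal.signal()
# raises ValueError from a non-main thread, so registration is skipped
# there. The shutdown callback is a placeholder.
import signal
import threading


def shutdown(signum, frame) -> None:
    print(f"received signal {signum}, shutting down")


def install_handlers() -> None:
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGINT, shutdown)
        signal.signal(signal.SIGTERM, shutdown)
    else:
        # e.g. when the executor is constructed inside a worker thread
        print("not the main thread; leaving default signal handlers")


install_handlers()                                 # registers the handlers
threading.Thread(target=install_handlers).start()  # prints the fallback
```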

View File

@ -29,6 +29,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
uses_ray: bool = True
def _init_executor(self) -> None:
self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
@ -60,8 +61,6 @@ class RayGPUExecutor(DistributedGPUExecutor):
# Create the parallel GPU workers.
self._init_workers_ray(placement_group)
self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
def _configure_ray_workers_use_nsight(self,
ray_remote_kwargs) -> Dict[str, Any]:
# If nsight profiling is enabled, we need to set the profiling

View File

@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest
from vllm.utils import get_ip, is_hip, is_xpu
from vllm.utils import get_ip, is_hip, is_tpu, is_xpu
from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
@ -93,32 +93,38 @@ def initialize_ray_cluster(
# Placement group is already set.
return
device_str = "GPU" if not is_tpu() else "TPU"
# Create placement group for worker processes
current_placement_group = ray.util.get_current_placement_group()
if current_placement_group:
# We are in a placement group
bundles = current_placement_group.bundle_specs
# Verify that we can use the placement group.
gpu_bundles = 0
device_bundles = 0
for bundle in bundles:
bundle_gpus = bundle.get("GPU", 0)
if bundle_gpus > 1:
bundle_devices = bundle.get(device_str, 0)
if bundle_devices > 1:
raise ValueError(
"Placement group bundle cannot have more than 1 GPU.")
if bundle_gpus:
gpu_bundles += 1
if parallel_config.world_size > gpu_bundles:
"Placement group bundle cannot have more than 1 "
f"{device_str}.")
if bundle_devices:
device_bundles += 1
if parallel_config.world_size > device_bundles:
raise ValueError(
"The number of required GPUs exceeds the total number of "
"available GPUs in the placement group.")
f"The number of required {device_str}s exceeds the total "
f"number of available {device_str}s in the placement group."
f"Required number of devices: {parallel_config.world_size}. "
f"Total number of devices: {device_bundles}.")
else:
num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
if parallel_config.world_size > num_gpus_in_cluster:
num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
if parallel_config.world_size > num_devices_in_cluster:
raise ValueError(
"The number of required GPUs exceeds the total number of "
"available GPUs in the cluster.")
f"The number of required {device_str}s exceeds the total "
f"number of available {device_str}s in the cluster.")
# Create a new placement group
placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size)
placement_group_specs = ([{
device_str: 1
}] * parallel_config.world_size)
current_placement_group = ray.util.placement_group(
placement_group_specs)
# Wait until PG is ready - this will block until all
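The changes above validate a Ray placement group generically over a device type ("GPU" or "TPU"): each bundle may hold at most one device, and the number of device-carrying bundles must cover the world size. A self-contained sketch of that check over plain dicts (no Ray required; the bundles below are illustrative):

```python
# Sketch of the placement-group validation rewritten above, operating on
# plain bundle dicts instead of Ray objects.
from typing import Dict, List


def validate_bundles(bundles: List[Dict[str, float]], device_str: str,
                     world_size: int) -> None:
    device_bundles = 0
    for bundle in bundles:
        bundle_devices = bundle.get(device_str, 0)
        if bundle_devices > 1:
            raise ValueError(
                f"Placement group bundle cannot have more than 1 {device_str}.")
        if bundle_devices:
            device_bundles += 1
    if world_size > device_bundles:
        raise ValueError(
            f"The number of required {device_str}s exceeds the total number "
            f"of available {device_str}s in the placement group. "
            f"Required: {world_size}. Available: {device_bundles}.")


validate_bundles([{"TPU": 1}, {"TPU": 1}, {"CPU": 4}], "TPU", world_size=2)
print("placement group bundles are sufficient")
```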

View File

@ -1067,6 +1067,10 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
def soft_cap(self):
return self.base_layer.soft_cap
@property
def use_gather(self):
return self.base_layer.use_gather
@property
def org_vocab_size(self):
return self.base_layer.org_vocab_size

View File

@ -159,6 +159,19 @@ class QuickGELU(CustomOp):
# def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
class ReLUSquaredActivation(CustomOp):
"""
Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
"""
def forward_native(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
return torch.square(F.relu(x))
def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
return self.forward_native(x)
class ScaledActivation(nn.Module):
"""An activation function with post-scale parameters.
@ -207,6 +220,7 @@ _ACTIVATION_REGISTRY = {
"gelu_new": NewGELU(),
"gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
"relu": nn.ReLU(),
"relu2": ReLUSquaredActivation(),
"quick_gelu": QuickGELU(),
}
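The new `ReLUSquaredActivation` registered above as "relu2" computes square(relu(x)). A quick numerical check, assuming PyTorch is installed:

```python
# Quick numerical check of the relu^2 activation registered above as
# "relu2": square(relu(x)) on a small tensor.
import torch
import torch.nn.functional as F

x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
relu_squared = torch.square(F.relu(x))
print(relu_squared)  # tensor([0.0000, 0.0000, 0.0000, 0.2500, 4.0000])
```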

View File

@ -199,12 +199,16 @@ class ReplicatedLinear(LinearBase):
self.input_size,
self.output_size,
self.params_dtype,
weight_loader=self.weight_loader,
prefix=prefix)
if bias:
self.bias = Parameter(
torch.empty(self.output_size, dtype=self.params_dtype))
set_weight_attrs(self.bias, {"output_dim": 0})
set_weight_attrs(self.bias, {
"output_dim": 0,
"weight_loader": self.weight_loader,
})
else:
self.register_parameter("bias", None)

View File

@ -5,10 +5,12 @@ from typing import Optional
import torch
import torch.nn as nn
from vllm.distributed import tensor_model_parallel_gather
from vllm.distributed import (tensor_model_parallel_all_gather,
tensor_model_parallel_gather)
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.platforms import current_platform
class LogitsProcessor(nn.Module):
@ -39,6 +41,8 @@ class LogitsProcessor(nn.Module):
self.org_vocab_size = org_vocab_size or vocab_size
# Soft cap the logits. Used in Gemma 2.
self.soft_cap = soft_cap
# Whether to use gather or all-gather to gather the logits.
self.use_gather = not current_platform.is_tpu()
def forward(
self,
@ -76,7 +80,15 @@ class LogitsProcessor(nn.Module):
logits = lm_head.linear_method.apply(lm_head,
hidden_states,
bias=embedding_bias)
logits = tensor_model_parallel_gather(logits)
if self.use_gather:
logits = tensor_model_parallel_gather(logits)
else:
# Gather is not supported for some devices such as TPUs.
# Use all-gather instead.
# NOTE(woosuk): Here, the outputs of every device should not be None
# because XLA requires strict SPMD among all devices. Every device
# should execute the same operations after gathering the logits.
logits = tensor_model_parallel_all_gather(logits)
# Remove paddings in vocab (if any).
if logits is not None:
logits = logits[:, :self.org_vocab_size]
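The branch above chooses between gather (the full logits land only on the destination rank, None elsewhere) and all-gather (every rank holds the full logits), because TPUs require SPMD-style execution where every device runs the same ops. A toy single-process simulation of the two collectives over Python lists; the shards are made-up values:

```python
# Toy single-process simulation of the gather vs. all-gather choice above.
# Each "rank" holds a shard of the vocab dimension; gather materializes the
# full logits only on the destination rank, all-gather on every rank.
from typing import List, Optional

shards = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]  # per-rank logit shards


def gather(shards: List[List[float]], dst: int) -> List[Optional[List[float]]]:
    full = [v for shard in shards for v in shard]
    return [full if rank == dst else None for rank in range(len(shards))]


def all_gather(shards: List[List[float]]) -> List[List[float]]:
    full = [v for shard in shards for v in shard]
    return [full for _ in shards]


print(gather(shards, dst=0))   # only rank 0 sees the full logits
print(all_gather(shards))      # every rank sees the full logits (SPMD-friendly)
```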

View File

@ -25,7 +25,7 @@ class AWQMarlinConfig(QuantizationConfig):
def __init__(self, weight_bits: int, group_size: int, has_zp: bool,
lm_head_quantized: bool) -> None:
self.weight_bits = weight_bits
self.pack_factor = 32 // self.weight_bits # packed into int32
self.pack_factor = 32 // self.weight_bits # packed into 32 bits
self.group_size = group_size
self.has_zp = has_zp
self.lm_head_quantized = lm_head_quantized
@ -69,7 +69,8 @@ class AWQMarlinConfig(QuantizationConfig):
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:
can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
is_valid_user_quant = (user_quant is None or user_quant == "marlin")
is_valid_user_quant = (user_quant is None or user_quant == "marlin"
or user_quant == "awq_marlin")
if can_convert and is_valid_user_quant:
msg = ("The model is convertible to {} during runtime."

View File

@ -15,19 +15,11 @@ class BitsAndBytesConfig(QuantizationConfig):
Reference: https://arxiv.org/abs/2305.14314
"""
def __init__(
self,
adapter_name_or_path: str,
target_modules: List[str],
) -> None:
self.adapter_name_or_path = adapter_name_or_path
self.target_modules = target_modules
def __init__(self, ) -> None:
pass
def __repr__(self) -> str:
return (
f"BitsAndBytesConfig(adapter_name_or_path={self.adapter_name_or_path}"
)
return "BitsAndBytesConfig"
@classmethod
def get_name(self) -> str:
@ -49,16 +41,7 @@ class BitsAndBytesConfig(QuantizationConfig):
@classmethod
def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig":
adapter_name = cls.get_from_keys(config, ["adapter_name_or_path"])
default_target_modules = [
"gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj",
"o_proj"
]
if adapter_name == "":
target_modules = default_target_modules
else:
target_modules = cls.get_from_keys(config, ["target_modules"])
return cls(adapter_name, target_modules)
return cls()
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["BitsAndBytesLinearMethod"]:

View File

@ -10,7 +10,8 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS,
CompressedTensorsScheme, CompressedTensorsUnquantized,
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
CompressedTensorsW8A8Int8, CompressedTensorsWNA16)
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
CompressionFormat, QuantizationArgs, QuantizationStrategy,
QuantizationType, find_matched_target, is_activation_quantization_format,
@ -100,14 +101,18 @@ class CompressedTensorsConfig(QuantizationConfig):
def get_config_filenames(cls) -> List[str]:
return []
def _check_scheme_supported(self, min_capability: int):
def _check_scheme_supported(self,
min_capability: int,
error: bool = True) -> bool:
capability = current_platform.get_device_capability()
capability = capability[0] * 10 + capability[1]
if capability < min_capability:
supported = capability >= min_capability
if error and not supported:
raise RuntimeError(
"Quantization scheme is not supported for ",
f"the current GPU. Min capability: {min_capability}. ",
f"Current capability: {capability}.")
return supported
def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
input_quant: BaseModel) -> bool:
@ -170,6 +175,29 @@ class CompressedTensorsConfig(QuantizationConfig):
# All conditions satisfied.
return True
def _is_fp8_w8a16(self, weight_quant: BaseModel,
input_quant: BaseModel) -> bool:
# Confirm weights quantized.
if weight_quant is None:
return False
# Confirm we have floating points.
if weight_quant.type != QuantizationType.FLOAT:
return False
# Confirm weight scheme is supported.
is_symmetric_weight = weight_quant.symmetric
is_static_weight = not weight_quant.dynamic
is_per_tensor_or_channel_weight = (weight_quant.strategy in [
QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
])
if not (is_symmetric_weight and is_static_weight
and is_per_tensor_or_channel_weight):
return False
# All conditions satisfied.
return True
def _is_wNa16_group_channel(self, weight_quant: BaseModel,
input_quant: BaseModel) -> bool:
input_quant_none = input_quant is None
@ -204,9 +232,23 @@ class CompressedTensorsConfig(QuantizationConfig):
# Detect If Activation Quantization.
if is_activation_quantization_format(self.quant_format):
if self._is_fp8_w8a8(weight_quant, input_quant):
return CompressedTensorsW8A8Fp8(
is_fp8_w8a8_supported = self._check_scheme_supported(
CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
if is_fp8_w8a8_supported:
return CompressedTensorsW8A8Fp8(
strategy=weight_quant.strategy,
is_static_input_scheme=(not input_quant.dynamic))
else:
return CompressedTensorsW8A16Fp8(
strategy=weight_quant.strategy,
is_static_input_scheme=(input_quant
and not input_quant.dynamic))
if self._is_fp8_w8a16(weight_quant, input_quant):
return CompressedTensorsW8A16Fp8(
strategy=weight_quant.strategy,
is_static_input_scheme=(not input_quant.dynamic))
is_static_input_scheme=(input_quant
and not input_quant.dynamic))
if self._is_static_tensor_w8a8(weight_quant, input_quant):
return CompressedTensorsW8A8Int8(
@ -257,11 +299,10 @@ class CompressedTensorsConfig(QuantizationConfig):
targets=self.target_scheme_map.keys())
# Find the quant_scheme
scheme = self.target_scheme_map[matched_target]
return self._get_scheme_from_parts(
weight_quant=scheme["weights"],
input_quant=scheme["input_activations"])
scheme_dict = self.target_scheme_map[matched_target]
scheme = self._get_scheme_from_parts(
weight_quant=scheme_dict["weights"],
input_quant=scheme_dict["input_activations"])
# Raise error if device does not support the scheme
# (e.g. fp8 needs ada lovelace)
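The scheme selection above probes device capability with `error=False` so it can fall back from the cutlass W8A8-FP8 scheme (minimum capability 89) to the Marlin-based W8A16-FP8 scheme (minimum capability 80) on older GPUs. A small sketch of that comparison; the capability tuples are example values, not queried from a real device:

```python
# Sketch of the capability check/fallback used above: a (major, minor)
# compute capability is flattened to major*10 + minor and compared against
# a scheme's minimum capability.
from typing import Tuple


def scheme_supported(capability: Tuple[int, int], min_capability: int) -> bool:
    major, minor = capability
    return major * 10 + minor >= min_capability


ada = (8, 9)     # e.g. L40/RTX 4090-class
ampere = (8, 0)  # e.g. A100-class

# W8A8-FP8 needs 89 (Ada and up); W8A16-FP8 via Marlin needs 80 (Ampere and up).
for cap in (ada, ampere):
    scheme = ("CompressedTensorsW8A8Fp8" if scheme_supported(cap, 89)
              else "CompressedTensorsW8A16Fp8")
    print(cap, "->", scheme)
```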

View File

@ -4,6 +4,7 @@ from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
CompressedTensorsW4A16Sparse24)
from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS,
CompressedTensorsWNA16)
@ -11,6 +12,7 @@ __all__ = [
"CompressedTensorsScheme",
"CompressedTensorsUnquantized",
"CompressedTensorsWNA16",
"CompressedTensorsW8A16Fp8",
"CompressedTensorsW4A16Sparse24",
"CompressedTensorsW8A8Int8",
"CompressedTensorsW8A8Fp8",

View File

@ -12,8 +12,9 @@ class CompressedTensorsScheme(ABC):
of different quantization schemes supported by CompressedTensors.
"""
@classmethod
@abstractmethod
def get_min_capability(self) -> int:
def get_min_capability(cls) -> int:
"""
Get minimum device capability.
"""

View File

@ -18,7 +18,8 @@ class CompressedTensorsUnquantized(CompressedTensorsScheme):
in a linear transformation.
"""
def get_min_capability(self) -> int:
@classmethod
def get_min_capability(cls) -> int:
# volta and up
return 70

View File

@ -29,7 +29,8 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
raise ValueError(
"group_size must be given when using strategy group")
def get_min_capability(self) -> int:
@classmethod
def get_min_capability(cls) -> int:
# ampere + up
return 80

View File

@ -0,0 +1,105 @@
from typing import Callable, List, Optional
import torch
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationStrategy)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
convert_to_channelwise, create_per_channel_scale_param,
create_per_tensor_scale_param)
from vllm.model_executor.utils import set_weight_attrs
__all__ = ["CompressedTensorsW8A16Fp8"]
SUPPORTED_STRATEGIES = [
QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR
]
class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
def __init__(self, strategy: str, is_static_input_scheme: bool):
self.strategy = strategy
self.is_static_input_scheme = is_static_input_scheme
@classmethod
def get_min_capability(cls) -> int:
# ampere and up
return 80
# W8A8-Fp8 kernels support only per-tensor and per-channel cases.
# So if we have a fused module (QKV, MLP) with per tensor scales,
# we expand each scale to its shard's channels.
def process_weights_after_loading(self, layer) -> None:
if self.strategy == QuantizationStrategy.TENSOR:
ws_channelwise = convert_to_channelwise(layer.weight_scale,
layer.logical_widths)
layer.weight_scale = torch.nn.Parameter(ws_channelwise,
requires_grad=False)
# Weights must be transposed for marlin
layer.weight = torch.nn.Parameter(layer.weight.t(),
requires_grad=False)
prepare_fp8_layer_for_marlin(layer, strategy="channel")
def create_weights(self, layer: torch.nn.Module, input_size: int,
output_partition_sizes: List[int],
input_size_per_partition: int,
params_dtype: torch.dtype, weight_loader: Callable,
**kwargs):
output_size_per_partition = sum(output_partition_sizes)
layer.logical_widths = output_partition_sizes
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
layer.orig_dtype = params_dtype
# WEIGHT
weight = torch.nn.Parameter(torch.empty(output_size_per_partition,
input_size_per_partition,
dtype=torch.float8_e4m3fn),
requires_grad=False)
layer.register_parameter("weight", weight)
set_weight_attrs(weight, {
"input_dim": 1,
"output_dim": 0,
"weight_loader": weight_loader,
})
# WEIGHT SCALE
layer_kwargs = {"weight_loader": weight_loader}
if self.strategy == QuantizationStrategy.CHANNEL:
weight_scale = create_per_channel_scale_param(
output_partition_sizes, **layer_kwargs)
elif self.strategy == QuantizationStrategy.TENSOR:
weight_scale = create_per_tensor_scale_param(
output_partition_sizes, **layer_kwargs)
else:
raise ValueError(
f"Unsupported weight strategy={self.strategy}, "
f"supported strategies are {SUPPORTED_STRATEGIES}")
layer.register_parameter("weight_scale", weight_scale)
# INPUT SCALE (to deal with converted checkpoints)
if self.is_static_input_scheme:
input_scale = create_per_tensor_scale_param(
output_partition_sizes, **layer_kwargs)
layer.register_parameter("input_scale", input_scale)
def apply_weights(self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
return apply_fp8_marlin_linear(input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
workspace=layer.workspace,
size_n=layer.output_size_per_partition,
size_k=layer.input_size_per_partition,
bias=bias)
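`process_weights_after_loading` above expands a fused module's per-tensor (per-shard) scales to channelwise before handing the weight to the Marlin kernel. A numpy sketch of that expansion, assuming each shard's scale is simply repeated across that shard's output channels; this is a stand-in for `convert_to_channelwise`, not the actual helper:

```python
# Numpy sketch of expanding per-tensor (per-shard) scales to channelwise
# scales for a fused module, as done above before the Marlin path.
from typing import List

import numpy as np


def to_channelwise(per_shard_scales: np.ndarray,
                   logical_widths: List[int]) -> np.ndarray:
    # Repeat each shard's single scale across that shard's output channels.
    return np.concatenate([
        np.full(width, scale)
        for scale, width in zip(per_shard_scales, logical_widths)
    ])


# A fused QKV projection with output widths 4, 2, 2 and one scale per shard.
logical_widths = [4, 2, 2]
per_shard_scales = np.array([0.5, 0.25, 1.0])
print(to_channelwise(per_shard_scales, logical_widths))
# [0.5  0.5  0.5  0.5  0.25 0.25 1.   1.  ]
```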

View File

@ -23,7 +23,8 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
self.is_static_input_scheme = is_static_input_scheme
self.cutlass_fp8_supported = cutlass_fp8_supported()
def get_min_capability(self) -> int:
@classmethod
def get_min_capability(cls) -> int:
# lovelace and up
return 89
@ -77,19 +78,20 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
})
# WEIGHT SCALE
layer_kwargs = {"weight_loader": weight_loader}
if self.strategy == QuantizationStrategy.CHANNEL:
weight_scale = create_per_channel_scale_param(
output_partition_sizes, weight_loader=weight_loader)
output_partition_sizes, **layer_kwargs)
else:
assert self.strategy == QuantizationStrategy.TENSOR
weight_scale = create_per_tensor_scale_param(
output_partition_sizes, weight_loader=weight_loader)
output_partition_sizes, **layer_kwargs)
layer.register_parameter("weight_scale", weight_scale)
# INPUT SCALE
if self.is_static_input_scheme:
input_scale = create_per_tensor_scale_param(
output_partition_sizes, weight_loader=weight_loader)
output_partition_sizes, **layer_kwargs)
layer.register_parameter("input_scale", input_scale)
def apply_weights(self,

View File

@ -19,7 +19,8 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
self.strategy = strategy
self.is_static_input_scheme = is_static_input_scheme
def get_min_capability(self) -> int:
@classmethod
def get_min_capability(cls) -> int:
# turing and up
return 75
@ -68,19 +69,19 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
# WEIGHT SCALE
layer_kwargs = {"weight_loader": weight_loader}
if self.strategy == QuantizationStrategy.CHANNEL:
scale = create_per_channel_scale_param(output_partition_sizes,
**layer_kwargs)
weight_scale = create_per_channel_scale_param(
output_partition_sizes, **layer_kwargs)
else:
assert self.strategy == QuantizationStrategy.TENSOR
scale = create_per_tensor_scale_param(output_partition_sizes,
**layer_kwargs)
layer.register_parameter("weight_scale", scale)
weight_scale = create_per_tensor_scale_param(
output_partition_sizes, **layer_kwargs)
layer.register_parameter("weight_scale", weight_scale)
# INPUT SCALE
if self.is_static_input_scheme:
scale = create_per_tensor_scale_param(output_partition_sizes,
**layer_kwargs)
layer.register_parameter("input_scale", scale)
input_scale = create_per_tensor_scale_param(
output_partition_sizes, **layer_kwargs)
layer.register_parameter("input_scale", input_scale)
def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
bias: Optional[torch.Tensor]) -> torch.Tensor:

View File

@ -42,7 +42,8 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
group_size=self.group_size,
is_sym=True)
def get_min_capability(self) -> int:
@classmethod
def get_min_capability(cls) -> int:
# ampere and up
return 80
@ -54,7 +55,12 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
output_size_per_partition = sum(output_partition_sizes)
# If group_size is -1, we are in channelwise case.
group_size = input_size if self.group_size == -1 else self.group_size
channelwise = (self.group_size == -1)
group_size = input_size if channelwise else self.group_size
row_parallel = (input_size != input_size_per_partition)
# In the case of channelwise quantization, we need to replicate the
# scales across all gpus.
partition_scales = (row_parallel and not channelwise)
verify_marlin_supports_shape(
output_size_per_partition=output_size_per_partition,
@ -65,8 +71,8 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
weight_scale_dim = None
scales_and_zp_size = input_size // group_size
if (input_size != input_size_per_partition
and self.group_size is not None):
if partition_scales:
assert input_size_per_partition % group_size == 0
weight_scale_dim = 1
scales_and_zp_size = input_size_per_partition // group_size

View File

@ -5,6 +5,9 @@ from typing import Any, Dict, Iterable, Optional
from pydantic import BaseModel, Field
from torch.nn import Module
from vllm.model_executor.layers.quantization.utils.quant_utils import (
FUSED_LAYER_NAME_MAPPING)
class CompressionFormat(Enum):
dense = "dense"
@ -86,13 +89,6 @@ def is_activation_quantization_format(format: str) -> bool:
return format in _ACTIVATION_QUANTIZATION_FORMATS
# fused_name: List[shard_name]
_FUSED_LAYER_NAME_MAPPING = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"]
}
def should_ignore_layer(layer_name: Optional[str],
ignore: Iterable[str]) -> bool:
if layer_name is None:
@ -106,8 +102,8 @@ def should_ignore_layer(layer_name: Optional[str],
# in the safetensors checkpoint. So, we convert the name
# from the fused version to unfused + check to make sure that
# each shard of the fused layer has the same scheme.
if proj_name in _FUSED_LAYER_NAME_MAPPING:
shard_proj_names = _FUSED_LAYER_NAME_MAPPING[proj_name]
if proj_name in FUSED_LAYER_NAME_MAPPING:
shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name]
# Convert fused_name --> [shard_names]
shard_names = [

View File

@ -11,6 +11,8 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
is_layer_skipped)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
apply_fp8_linear, create_per_channel_scale_param)
from vllm.model_executor.utils import set_weight_attrs
@ -18,14 +20,6 @@ from vllm.platforms import current_platform
logger = init_logger(__name__)
# Note: this is a hack. We should update each model to register the
# stacked params and get it from there instead in a future PR.
# fused_name: List[shard_name]
_FUSED_LAYER_NAME_MAPPING = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"]
}
class FBGEMMFp8Config(QuantizationConfig):
"""Config class for FBGEMM Fp8."""
@ -62,37 +56,10 @@ class FBGEMMFp8Config(QuantizationConfig):
input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"])
return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub)
def _is_layer_skipped(self, prefix: str) -> bool:
# prefix: model.layers.0.self_attn.q_proj
# proj_name: q_proj
proj_name = prefix.split(".")[-1]
if proj_name in _FUSED_LAYER_NAME_MAPPING:
shard_prefixes = [
prefix.replace(proj_name, shard_proj_name)
for shard_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name]
]
is_skipped = None
for shard_prefix in shard_prefixes:
is_shard_skipped = shard_prefix in self.ignore_list
if is_skipped is None:
is_skipped = is_shard_skipped
elif is_shard_skipped != is_skipped:
raise ValueError(
f"Detected some but not all shards of {prefix} "
"are quantized. All shards of fused layers "
"to have the same precision.")
else:
is_skipped = prefix in self.ignore_list
assert is_skipped is not None
return is_skipped
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["QuantizeMethodBase"]:
if isinstance(layer, LinearBase):
if self._is_layer_skipped(prefix):
if is_layer_skipped(prefix, self.ignore_list):
return UnquantizedLinearMethod()
return FBGEMMFp8LinearMethod(self)
return None

View File

@ -8,15 +8,19 @@ from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
fused_moe)
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
is_layer_skipped)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
all_close_1d, apply_fp8_linear, create_per_tensor_scale_param,
cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale)
all_close_1d, apply_fp8_linear, convert_to_channelwise,
create_per_tensor_scale_param, cutlass_fp8_supported,
per_tensor_dequantize, requantize_with_max_scale)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.utils import print_warning_once
@ -33,6 +37,7 @@ class Fp8Config(QuantizationConfig):
self,
is_checkpoint_fp8_serialized: bool = False,
activation_scheme: str = "dynamic",
ignored_layers: Optional[List[str]] = None,
) -> None:
self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
if is_checkpoint_fp8_serialized:
@ -42,6 +47,7 @@ class Fp8Config(QuantizationConfig):
raise ValueError(
f"Unsupported activation scheme {activation_scheme}")
self.activation_scheme = activation_scheme
self.ignored_layers = ignored_layers or []
@classmethod
def get_name(cls) -> str:
@ -64,14 +70,18 @@ class Fp8Config(QuantizationConfig):
quant_method = cls.get_from_keys(config, ["quant_method"])
is_checkpoint_fp8_serialized = ("fp8" in quant_method)
activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
activation_scheme=activation_scheme)
activation_scheme=activation_scheme,
ignored_layers=ignored_layers)
def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["QuantizeMethodBase"]:
from vllm.attention.layer import Attention # Avoid circular import
if isinstance(layer, LinearBase):
if is_layer_skipped(prefix, self.ignored_layers):
return UnquantizedLinearMethod()
return Fp8LinearMethod(self)
elif isinstance(layer, FusedMoE):
return Fp8MoEMethod(self)
@ -170,19 +180,29 @@ class Fp8LinearMethod(LinearMethodBase):
layer.weight_scale = Parameter(weight_scale, requires_grad=False)
layer.input_scale = None
# If checkpoint is fp8, requantize the separately quantized logical
# weights into a single fp8 weight with a single weight scale.
# If checkpoint is fp8, handle that there are N scales for N
# shards in a fused module
else:
# Dequant -> Quant with max scale.
max_w_scale, weight = requantize_with_max_scale(
weight=layer.weight,
weight_scale=layer.weight_scale,
logical_widths=layer.logical_widths,
)
# If using marlin (w8a16), kernel uses channelwise weights,
# so extend the weight scales to be channelwise.
if self.use_marlin:
weight = layer.weight
weight_scale = convert_to_channelwise(layer.weight_scale,
layer.logical_widths)
# If using w8a8, torch._scaled_mm needs per tensor, so
# requantize the logical shards as a single weight.
else:
# Dequant -> Quant with max scale so we can run per tensor.
weight_scale, weight = requantize_with_max_scale(
weight=layer.weight,
weight_scale=layer.weight_scale,
logical_widths=layer.logical_widths,
)
# Update layer with new values.
layer.weight = Parameter(weight.t(), requires_grad=False)
layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
layer.weight_scale = Parameter(weight_scale, requires_grad=False)
if self.quant_config.activation_scheme == "static":
layer.input_scale = Parameter(layer.input_scale.max(),
requires_grad=False)

View File

@ -79,7 +79,8 @@ class GPTQMarlinConfig(QuantizationConfig):
user_quant) -> Optional[str]:
can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg)
is_valid_user_quant = (user_quant is None or user_quant == "marlin")
is_valid_user_quant = (user_quant is None or user_quant == "marlin"
or user_quant == "gptq_marlin")
if can_convert and is_valid_user_quant:
msg = ("The model is convertible to {} during runtime."

View File

@ -46,10 +46,8 @@ class BaseKVCacheMethod(QuantizeMethodBase):
elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
# If no scales were loaded (both scales are invalid negative
# values), use the default value of 1.0
k_scale = torch.nn.Parameter(torch.tensor(1.0),
requires_grad=False)
v_scale = torch.nn.Parameter(torch.tensor(1.0),
requires_grad=False)
k_scale = 1.0
v_scale = 1.0
else:
# If we find a single kv_scale in the checkpoint, we remap
# kv_scale to k_scale during weight loading, and duplicate

View File

@ -46,7 +46,8 @@ def apply_fp8_marlin_linear(
return output.reshape(out_shape)
def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
strategy: str = "tensor") -> None:
print_warning_once(
"Your GPU does not have native support for FP8 computation but "
"FP8 quantization is being used. Weight-only FP8 compression will "
@ -74,16 +75,7 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
# WEIGHT SCALES
# Currently Marlin doesn't support per-tensor scales, so we
# expand it to channelwise
is_channelwise = (len(layer.weight_scale.shape) > 0
and layer.weight_scale.shape[0] == part_size_n)
if is_channelwise:
scales = layer.weight_scale
else:
scales = layer.weight_scale.repeat(1, part_size_n)
scales = scales.to(layer.orig_dtype).to(device)
scales = layer.weight_scale.to(layer.orig_dtype)
# Permute scales
marlin_scales = marlin_permute_scales(s=scales,
size_k=part_size_k,

View File

@ -1,10 +1,48 @@
"""This file is used for /tests and /benchmarks"""
from typing import List
import numpy
import torch
SUPPORTED_NUM_BITS = [4, 8]
SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
# Note: this is a hack. We should update each model to register the
# stacked params and get it from there instead in a future PR.
# fused_name: List[shard_name]
FUSED_LAYER_NAME_MAPPING = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"]
}
def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool:
# prefix: model.layers.0.self_attn.q_proj
# proj_name: q_proj
proj_name = prefix.split(".")[-1]
if proj_name in FUSED_LAYER_NAME_MAPPING:
shard_prefixes = [
prefix.replace(proj_name, shard_proj_name)
for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name]
]
is_skipped = None
for shard_prefix in shard_prefixes:
is_shard_skipped = shard_prefix in ignored_layers
if is_skipped is None:
is_skipped = is_shard_skipped
elif is_shard_skipped != is_skipped:
raise ValueError(
f"Detected some but not all shards of {prefix} "
"are quantized. All shards of fused layers must "
"have the same precision.")
else:
is_skipped = prefix in ignored_layers
assert is_skipped is not None
return is_skipped
def get_pack_factor(num_bits):
assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}"
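`is_layer_skipped` above maps a fused projection name back to its shards before consulting the ignore list, so a fused layer is only skipped when all of its shards are ignored. A usage sketch with a hypothetical ignore list, assuming a vLLM build that includes this helper:

```python
# Usage sketch for the helper above; the prefixes below are illustrative,
# not taken from a real checkpoint config.
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    is_layer_skipped)

ignored_layers = [
    "model.layers.0.self_attn.q_proj",
    "model.layers.0.self_attn.k_proj",
    "model.layers.0.self_attn.v_proj",
    "lm_head",
]

# Every shard of the fused qkv_proj is ignored, so the fused layer is
# skipped (it falls back to the unquantized linear method).
print(is_layer_skipped("model.layers.0.self_attn.qkv_proj", ignored_layers))
# True

# No shard of gate_up_proj is ignored, so it stays quantized.
print(is_layer_skipped("model.layers.0.mlp.gate_up_proj", ignored_layers))
# False
```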

View File

@ -774,6 +774,7 @@ def get_rope(
is_neox_style: bool = True,
rope_scaling: Optional[Dict[str, Any]] = None,
dtype: Optional[torch.dtype] = None,
rotary_percent: float = 1.0,
) -> RotaryEmbedding:
if dtype is None:
dtype = torch.get_default_dtype()
@ -786,6 +787,8 @@ def get_rope(
rope_scaling_args = tuple(rope_scaling_tuple.items())
else:
rope_scaling_args = None
if rotary_percent < 1.0:
rotary_dim = int(rotary_dim * rotary_percent)
key = (head_size, rotary_dim, max_position, base, is_neox_style,
rope_scaling_args, dtype)
if key in _ROPE_DICT:
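The new `rotary_percent` argument above shrinks the rotary dimension before the cache key is built, so rotary embeddings are applied to only a fraction of each head. A worked micro-example with illustrative values:

```python
# Worked example of the rotary_percent adjustment above: only a fraction of
# each head's dimensions receive rotary position embeddings.
head_size = 128
rotary_dim = head_size  # full rotary dim before adjustment
rotary_percent = 0.5

if rotary_percent < 1.0:
    rotary_dim = int(rotary_dim * rotary_percent)

print(rotary_dim)  # 64: RoPE is applied to the first 64 dims of each head
```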

View File

@ -220,7 +220,7 @@ def _apply_min_tokens_penalty(
seqs_to_penalize: List[int] = []
for j, seq_id in enumerate(seq_ids):
seq_data = seq_group.seq_data[seq_id]
if len(seq_data.output_token_ids) < min_tokens:
if len(seq_data.output_token_ids_array) < min_tokens:
seqs_to_penalize.append(j)
if seqs_to_penalize:

Some files were not shown because too many files have changed in this diff.