Compare commits


1 Commit

SHA1: e17250f0d2 | Message: fix precommit | Date: 2025-06-18 21:17:43 -07:00
247 changed files with 4691 additions and 10028 deletions

View File

@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
 - Download `nightly-benchmarks.zip`.
 - In the same folder, run the following code:
-```bash
+```console
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git

View File

@@ -102,7 +102,6 @@ steps:
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
@@ -118,7 +117,6 @@ steps:
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"

View File

@@ -54,11 +54,10 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
     --name "${container_name}" \
     ${image_name} \
     /bin/bash -c "
-        set -e; # Exit on first error
         python3 /workspace/vllm/examples/offline_inference/neuron.py;
         python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
         for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-            echo \"Running test file: \$f\";
+            echo 'Running test file: '$f;
             python3 -m pytest \$f -v --capture=tee-sys;
         done
     "

View File

@@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu
 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=256
-MAX_NUM_BATCHED_TOKENS=1024
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
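For intuition on the two values being changed: `max_num_seqs` caps how many sequences can be scheduled concurrently, while `max_num_batched_tokens` caps the tokens processed per engine step. A hedged offline-inference sketch using the same values (the `LLM` call is only an example, not part of this benchmark config):

```python
from vllm import LLM

# Illustrative offline equivalent of the TPU benchmark config above.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    max_num_seqs=512,            # max concurrently scheduled sequences
    max_num_batched_tokens=512,  # max tokens processed per scheduler step
    max_model_len=2048,
    tensor_parallel_size=1,
)
```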

View File

@@ -89,7 +89,7 @@ steps:
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 - label: Chunked Prefill Test
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill
@@ -271,15 +271,6 @@ steps:
   commands:
   - pytest -v -s prefix_caching
-- label: Platform Tests (CUDA)
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/cuda
-  commands:
-  - pytest -v -s cuda/test_cuda_context.py
 - label: Samplers Test # 36min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:

.github/mergify.yml
View File

@@ -45,7 +45,6 @@ pull_request_rules:
       - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
       - files~=^vllm/model_executor/models/.*llama.*\.py
       - files~=^vllm/transformers_utils/configs/.*llama.*\.py
-      - title~=(?i)llama
   actions:
     label:
       add:
@@ -66,19 +65,6 @@ pull_request_rules:
       add:
         - multi-modality
-- name: label-performance
-  description: Automatically apply performance label
-  conditions:
-    - or:
-      - files~=^benchmarks/
-      - files~=^vllm/benchmarks/
-      - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
-  actions:
-    label:
-      add:
-        - performance
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
@@ -88,6 +74,7 @@ pull_request_rules:
       - files~=^vllm/model_executor/models/.*qwen.*\.py
       - files~=^vllm/reasoning/.*qwen.*\.py
       - title~=(?i)Qwen
+      - body~=(?i)Qwen
   actions:
     label:
       add:
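As a rough illustration of what the added `body~=(?i)Qwen` condition does (Mergify evaluates these conditions itself; the helper below is only for intuition): the rule now applies the `qwen` label when either the PR title or the PR body mentions Qwen, case-insensitively.

```python
import re

QWEN_PATTERN = re.compile(r"(?i)qwen")

def should_label_qwen(title: str, body: str) -> bool:
    """Mimics the title~ / body~ regex conditions from the mergify rule above."""
    return bool(QWEN_PATTERN.search(title) or QWEN_PATTERN.search(body))

assert should_label_qwen("Fix bug", "This touches the QWEN tokenizer")
assert not should_label_qwen("Fix bug", "Unrelated change")
```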

View File

@@ -115,11 +115,6 @@ repos:
     entry: python tools/check_spdx_header.py
     language: python
     types: [python]
-  - id: check-root-lazy-imports
-    name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
-    language: python
-    types: [python]
   - id: check-filenames
     name: Check for spaces in all filenames
     entry: bash

View File

@@ -154,13 +154,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us
-<!-- --8<-- [start:contact-us] -->
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
 - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
-<!-- --8<-- [end:contact-us] -->
 ## Media Kit

View File

@@ -387,178 +387,3 @@ python3 vllm/benchmarks/benchmark_throughput.py \
     --enable-lora \
     --lora-path yard1/llama-2-7b-sql-lora-test
 ```
(the remaining lines of this file, shown below, were all removed by this commit)
---
## Example - Structured Output Benchmark
Benchmark the performance of structured output generation (JSON, grammar, regex).
### Server Setup
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```
### JSON Schema Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 10 \
--num-prompts 1000
```
### Grammar-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset grammar \
--structure-type grammar \
--request-rate 10 \
--num-prompts 1000
```
### Regex-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset regex \
--request-rate 10 \
--num-prompts 1000
```
### Choice-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset choice \
--request-rate 10 \
--num-prompts 1000
```
### XGrammar Benchmark Dataset
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset xgrammar_bench \
--request-rate 10 \
--num-prompts 1000
```
---
## Example - Long Document QA Throughput Benchmark
Benchmark the performance of long document question-answering with prefix caching.
### Basic Long Document QA Test
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 16 \
--document-length 2000 \
--output-len 50 \
--repeat-count 5
```
### Different Repeat Modes
```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode random
# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode tile
# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--document-length 3000 \
--repeat-count 3 \
--repeat-mode interleave
```
---
## Example - Prefix Caching Benchmark
Benchmark the efficiency of automatic prefix caching.
### Fixed Prompt with Prefix Caching
```bash
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
```
### ShareGPT Dataset with Prefix Caching
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 benchmarks/benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
```
---
## Example - Request Prioritization Benchmark
Benchmark the performance of request prioritization in vLLM.
### Basic Prioritization Test
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority
```
### Multiple Sequences per Prompt
```bash
python3 benchmarks/benchmark_prioritization.py \
--model meta-llama/Llama-2-7b-chat-hf \
--input-len 128 \
--output-len 64 \
--num-prompts 100 \
--scheduling-policy priority \
--n 2
```

View File

@@ -10,7 +10,6 @@
 # 3. Set variables (ALL REQUIRED)
 # BASE: your directory for vllm repo
 # MODEL: the model served by vllm
-# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
 # TP: ways of tensor parallelism
 # DOWNLOAD_DIR: directory to download and load model weights.
 # INPUT_LEN: request input len
@@ -35,7 +34,6 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
-SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
@@ -47,15 +45,12 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
-PROFILE_PATH="$LOG_FOLDER/profile"
 echo "result file: $RESULT"
 echo "model: $MODEL"
 rm -rf $LOG_FOLDER
-rm -rf $PROFILE_PATH
 mkdir -p $LOG_FOLDER
-mkdir -p $PROFILE_PATH
 cd "$BASE/vllm"
@@ -75,11 +70,10 @@ start_server() {
     local max_num_seqs=$2
     local max_num_batched_tokens=$3
     local vllm_log=$4
-    local profile_dir=$5
     pkill -f vllm
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
         --disable-log-requests \
         --port 8004 \
         --gpu-memory-utilization $gpu_memory_utilization \
@@ -111,37 +105,19 @@ start_server() {
     fi
 }
-update_best_profile() {
-    local profile_dir=$1
-    local profile_index=$2
-    sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
-    selected_profile_file=
-    if [[ "$SYSTEM" == "TPU" ]]; then
-        selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
-    fi
-    if [[ "$SYSTEM" == "GPU" ]]; then
-        selected_profile_file="${sorted_paths[$profile_index]}"
-    fi
-    rm -f $PROFILE_PATH/*
-    cp $selected_profile_file $PROFILE_PATH
-}
 run_benchmark() {
     local max_num_seqs=$1
     local max_num_batched_tokens=$2
     local gpu_memory_utilization=$3
     echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
     local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
-    local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
     echo "vllm_log: $vllm_log"
     echo
     rm -f $vllm_log
-    mkdir -p $profile_dir
     pkill -f vllm
-    local profile_index=0
     echo "starting server..."
-    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
+    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
     result=$?
     if [[ "$result" -eq 1 ]]; then
         echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -168,8 +144,7 @@ run_benchmark() {
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
         --num-prompts 1000 \
         --random-prefix-len $prefix_len \
-        --port 8004 \
-        --profile &> "$bm_log"
+        --port 8004 &> "$bm_log"
     throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
     goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@@ -183,7 +158,6 @@ run_benchmark() {
     # start from request-rate as int(throughput) + 1
     request_rate=$((${throughput%.*} + 1))
     while ((request_rate > 0)); do
-        profile_index=$((profile_index+1))
         # clear prefix cache
         curl -X POST http://0.0.0.0:8004/reset_prefix_cache
         sleep 5
@@ -221,12 +195,6 @@ run_benchmark() {
             best_max_num_seqs=$max_num_seqs
             best_num_batched_tokens=$max_num_batched_tokens
             best_goodput=$goodput
-            if [[ "$SYSTEM" == "TPU" ]]; then
-                update_best_profile "$profile_dir/plugins/profile" $profile_index
-            fi
-            if [[ "$SYSTEM" == "GPU" ]]; then
-                update_best_profile "$profile_dir" $profile_index
-            fi
         fi
     else
         echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@@ -271,6 +239,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
     done
 done
 echo "finish permutations"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"

View File

@@ -404,14 +404,8 @@ async def async_request_openai_chat_completions(
                     chunk_bytes = chunk_bytes.strip()
                     if not chunk_bytes:
                         continue
-                    chunk_bytes = chunk_bytes.decode("utf-8")
-                    # NOTE: SSE comments (often used as pings) start with a colon.
-                    # These are not JSON data payload and should be skipped.
-                    if chunk_bytes.startswith(":"):
-                        continue
-                    chunk = chunk_bytes.removeprefix("data: ")
+                    chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                     if chunk != "[DONE]":
                         timestamp = time.perf_counter()
                         data = json.loads(chunk)
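The removed lines dealt with Server-Sent Events comment lines, which begin with a colon and are often used as keep-alive pings rather than JSON payloads. A minimal sketch of the distinction; illustrative only, not the benchmark client itself:

```python
import json

def parse_sse_line(raw: bytes):
    """Return the decoded JSON payload of an SSE data line, or None for
    comment/keep-alive lines (starting with ':') and '[DONE]' markers."""
    line = raw.decode("utf-8").strip()
    if not line or line.startswith(":"):
        return None  # SSE comment/ping, not a data payload
    payload = line.removeprefix("data: ")
    if payload == "[DONE]":
        return None
    return json.loads(payload)

assert parse_sse_line(b": ping") is None
assert parse_sse_line(b'data: {"ok": true}') == {"ok": True}
```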

View File

@@ -353,7 +353,7 @@ class RandomDataset(BenchmarkDataset):
                 : input_lens[i]
             ]
             prompt = tokenizer.decode(re_encoded_sequence)
-            total_input_len = len(re_encoded_sequence)
+            total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(
                     prompt=prompt,
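Context for this change: decoding a token sequence to text and re-encoding it is not guaranteed to be length-preserving, so `len(re_encoded_sequence)` and `prefix_len + input_lens[i]` can disagree. A small hedged illustration with a Hugging Face tokenizer (assumes `transformers` is installed; the model name and token ids are only examples):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

token_ids = [101, 2023, 2003]        # arbitrary token ids for illustration
text = tokenizer.decode(token_ids)   # decode to text...
re_encoded = tokenizer.encode(text)  # ...and re-encode it

# The round trip may merge or split tokens, so the two lengths can differ.
print(len(token_ids), len(re_encoded))
```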

View File

@@ -97,7 +97,7 @@ def run_vllm(
         assert lora_requests is None, "BeamSearch API does not support LoRA"
         prompts = [request.prompt for request in requests]
         # output_len should be the same for all requests.
-        output_len = requests[0].expected_output_len
+        output_len = requests[0][2]
         for request in requests:
             assert request.expected_output_len == output_len
         start = time.perf_counter()
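The two access styles in this change pick out the same value only if the request objects also support positional indexing with the output length in position 2, for example a NamedTuple-style record. The toy sketch below only illustrates that assumption; `ToyRequest` is hypothetical and is not vLLM's `SampleRequest`:

```python
from typing import NamedTuple

class ToyRequest(NamedTuple):  # hypothetical stand-in, for illustration only
    prompt: str
    prompt_len: int
    expected_output_len: int

requests = [ToyRequest("hello", 5, 64)]

# Attribute access and positional access agree here only because the record
# is a NamedTuple with the output length as its third field.
assert requests[0].expected_output_len == requests[0][2] == 64
```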

View File

@@ -22,16 +22,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     MARLIN_SUPPORTED_GROUP_SIZES,
     query_marlin_supported_quant_types,
 )
-from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    FP4_MARLIN_SUPPORTED_GROUP_SIZES,
-    rand_marlin_weight_fp4_like,
-)
-from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-    marlin_quant_fp8_torch,
-)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
     MarlinWorkspace,
-    awq_marlin_quantize,
     marlin_quantize,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
@@ -43,7 +35,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     quantize_weights,
     sort_weights,
 )
-from vllm.scalar_type import ScalarType, scalar_types
+from vllm.scalar_type import ScalarType
 from vllm.utils import FlexibleArgumentParser
 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
@@ -65,144 +57,80 @@ def bench_run(
     size_n: int,
 ):
     label = "Quant Matmul"
     sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
         model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
     )
     print(f"Testing: {sub_label}")
     a = torch.randn(size_m, size_k).to(torch.half).cuda()
     b = torch.rand(size_k, size_n).to(torch.half).cuda()
has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
if act_order and (group_size == -1 or group_size == size_k or has_zp):
return
if size_k % group_size != 0:
return
marlin_24_supported = ( a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda()
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
)
repack_supported = (
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
)
allspark_supported = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
and not act_order
and is_k_full
)
def gen_marlin_params():
# Marlin quant
marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
if quant_type == scalar_types.float4_e2m1f:
if group_size != 16 or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
b.T, group_size
)
elif quant_type == scalar_types.float8_e4m3fn:
if group_size not in [-1, 128] or act_order:
return
marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
elif group_size == 16:
return
elif has_zp:
marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
b, quant_type, group_size
)
else:
marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
marlin_quantize(b, quant_type, group_size, act_order)
)
return (
marlin_w_ref,
marlin_q_w,
marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx,
marlin_sort_indices,
)
def gen_marlin_24_params():
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
if marlin_24_supported:
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
)
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
def gen_repack_params():
q_w_gptq = None
repack_sort_indices = None
if repack_supported:
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
)
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
return q_w_gptq, repack_sort_indices
def gen_allspark_params():
qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
CUBLAS_M_THRESHOLD
) = None
nonlocal allspark_supported
if allspark_supported:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
allspark_supported = allspark_supported and supported_arch
if supported_arch:
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
return (
qw_reorder,
s_reorder,
zp_reorder,
sm_count,
sm_version,
CUBLAS_M_THRESHOLD,
)
# Marlin quant
( (
marlin_w_ref, marlin_w_ref,
marlin_q_w, marlin_q_w,
marlin_s, marlin_s,
marlin_s2,
marlin_zp,
marlin_g_idx, marlin_g_idx,
marlin_sort_indices, marlin_sort_indices,
) = gen_marlin_params() marlin_rand_perm,
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = ( ) = marlin_quantize(b, quant_type, group_size, act_order)
gen_marlin_24_params()
# Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
marlin_24_quantize(b, quant_type, group_size)
) )
q_w_gptq, repack_sort_indices = gen_repack_params()
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = ( marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
gen_allspark_params()
# GPTQ quant
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
b, quant_type, group_size, act_order
) )
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
if act_order:
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
# Prepare # Prepare
marlin_workspace = MarlinWorkspace( marlin_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
) )
marlin_24_workspace = MarlinWorkspace( marlin_24_workspace = MarlinWorkspace(
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
) )
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
# AllSpark W8A16 quant
as_supported_case = (
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1
and not act_order
and is_k_full
)
if as_supported_case:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = sm_version >= 80 and sm_version < 90
as_supported_case = as_supported_case and supported_arch
if supported_arch:
has_zp = False
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp
)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
     globals = {
         # Gen params
@@ -212,14 +140,15 @@ def bench_run(
         "size_n": size_n,
         "size_k": size_k,
         "a": a,
+        "a_tmp": a_tmp,
         # Marlin params
         "marlin_w_ref": marlin_w_ref,
         "marlin_q_w": marlin_q_w,
         "marlin_s": marlin_s,
-        "marlin_s2": marlin_s2,
         "marlin_zp": marlin_zp,
         "marlin_g_idx": marlin_g_idx,
         "marlin_sort_indices": marlin_sort_indices,
+        "marlin_rand_perm": marlin_rand_perm,
         "marlin_workspace": marlin_workspace,
         "is_k_full": is_k_full,
         # Marlin_24 params
@@ -232,12 +161,12 @@ def bench_run(
         "q_w_gptq": q_w_gptq,
         "repack_sort_indices": repack_sort_indices,
         # AllSpark W8A16 params
-        "qw_reorder": qw_reorder,
-        "s_reorder": s_reorder,
-        "zp_reorder": zp_reorder,
-        "sm_count": sm_count,
-        "sm_version": sm_version,
-        "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
+        "qw_reorder": qw_reorder if as_supported_case else None,
+        "s_reorder": s_reorder if as_supported_case else None,
+        "zp_reorder": zp_reorder if as_supported_case else None,
+        "sm_count": sm_count if as_supported_case else None,
+        "sm_version": sm_version if as_supported_case else None,
+        "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None,
         # Kernels
         "gptq_marlin_gemm": ops.gptq_marlin_gemm,
         "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
@@ -248,7 +177,7 @@ def bench_run(
     min_run_time = 1
     # Warmup pytorch
-    for _ in range(5):
+    for i in range(5):
         torch.matmul(a, marlin_w_ref)
     results.append(
@@ -263,17 +192,17 @@ def bench_run(
     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
-            description="gptq_marlin_gemm",
+            description="gptq_marlin_gemm_fp16",
         ).blocked_autorange(min_run_time=min_run_time)
     )
     results.append(
         benchmark.Timer(
-            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -281,7 +210,10 @@ def bench_run(
         ).blocked_autorange(min_run_time=min_run_time)
     )
-    if marlin_24_supported:
+    if (
+        quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
+        and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
+    ):
         results.append(
             benchmark.Timer(
                 stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)",  # noqa: E501
@@ -292,18 +224,17 @@ def bench_run(
             ).blocked_autorange(min_run_time=min_run_time)
         )
-    if repack_supported:
     results.append(
         benchmark.Timer(
             stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
             description="gptq_marlin_repack",
         ).blocked_autorange(min_run_time=min_run_time)
     )
-    if allspark_supported:
+    if as_supported_case:
         results.append(
             benchmark.Timer(
                 stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)",  # noqa: E501
@@ -319,6 +250,7 @@ def main(args):
     print("Benchmarking models:")
     for i, model in enumerate(args.models):
         print(f"[{i}] {model}")
     results: list[benchmark.Measurement] = []
     for model in args.models:
@@ -346,17 +278,14 @@ def main(args):
                 ):
                     continue
-                for quant_type in query_marlin_supported_quant_types():
+                for quant_type in query_marlin_supported_quant_types(False):
                     if (
                         len(args.limit_num_bits) > 0
                         and quant_type.size_bits not in args.limit_num_bits
                     ):
                         continue
-                    for group_size in (
-                        MARLIN_SUPPORTED_GROUP_SIZES
-                        + FP4_MARLIN_SUPPORTED_GROUP_SIZES
-                    ):
+                    for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
                         if (
                             len(args.limit_group_size) > 0
                             and group_size not in args.limit_group_size
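For context on the AllSpark gating seen earlier in this diff, the kernel is only benchmarked on Ampere-class GPUs, that is compute capability 8.x. A hedged sketch of the same check, mirroring the `sm_version` logic shown in the diff (illustrative helper, not part of the benchmark):

```python
import torch

def allspark_arch_supported(device_index: int = 0) -> bool:
    """Return True on SM 8.x devices, as the benchmark's supported_arch check does."""
    props = torch.cuda.get_device_properties(device_index)
    sm_version = props.major * 10 + props.minor
    return 80 <= sm_version < 90
```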

View File

@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="6487649"
+ARG AITER_BRANCH="c1debd8"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 FROM ${BASE_IMAGE} AS base

View File

@@ -91,7 +91,7 @@ source to unblock the update process.
 ### FlashInfer
 Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
-```bash
+```
 export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
 export FLASHINFER_ENABLE_SM90=1
 uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
@@ -105,14 +105,14 @@ team if you want to get the package published there.
 ### xFormers
 Similar to FlashInfer, here is how to build and install xFormers from source:
-```bash
+```
 export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
 MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
 ```
 ### Mamba
-```bash
+```
 uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 ```

View File

@@ -16,33 +16,35 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}
 Start the vLLM OpenAI Compatible API server.
-??? Examples
+Examples:
 ```bash
 # Start with a model
 vllm serve meta-llama/Llama-2-7b-hf
 # Specify the port
 vllm serve meta-llama/Llama-2-7b-hf --port 8100
 # Check with --help for more options
 # To list all groups
 vllm serve --help=listgroup
 # To view a argument group
 vllm serve --help=ModelConfig
 # To view a single argument
 vllm serve --help=max-num-seqs
 # To search by keyword
 vllm serve --help=max
 ```
 ## chat
 Generate chat completions via the running API server.
+Examples:
 ```bash
 # Directly connect to localhost API without arguments
 vllm chat
@@ -58,6 +60,8 @@ vllm chat --quick "hi"
 Generate text completions based on the given prompt via the running API server.
+Examples:
 ```bash
 # Directly connect to localhost API without arguments
 vllm complete
@@ -69,8 +73,6 @@ vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
 vllm complete --quick "The future of AI is"
 ```
-</details>
 ## bench
 Run benchmark tests for latency online serving throughput and offline inference throughput.
@@ -87,6 +89,8 @@ vllm bench {latency, serve, throughput}
 Benchmark the latency of a single batch of requests.
+Example:
 ```bash
 vllm bench latency \
     --model meta-llama/Llama-3.2-1B-Instruct \
@@ -100,6 +104,8 @@ vllm bench latency \
 Benchmark the online serving throughput.
+Example:
 ```bash
 vllm bench serve \
     --model meta-llama/Llama-3.2-1B-Instruct \
@@ -114,6 +120,8 @@ vllm bench serve \
 Benchmark offline inference throughput.
+Example:
 ```bash
 vllm bench throughput \
     --model meta-llama/Llama-3.2-1B-Instruct \
@@ -135,8 +143,7 @@ vllm collect-env
 Run batch prompts and write results to file.
-<details>
-<summary>Examples</summary>
+Examples:
 ```bash
 # Running with a local file
@@ -152,8 +159,6 @@ vllm run-batch \
     --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
-</details>
 ## More Help
 For detailed options of any subcommand, use:

View File

@@ -1,6 +0,0 @@
----
-title: Contact Us
----
-[](){ #contactus }
---8<-- "README.md:contact-us"

View File

@@ -57,21 +57,19 @@ By default, we optimize model inference using CUDA graphs which take up extra me
 You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
-??? Code
 ```python
 from vllm import LLM
 from vllm.config import CompilationConfig, CompilationLevel

 llm = LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     compilation_config=CompilationConfig(
         level=CompilationLevel.PIECEWISE,
         # By default, it goes up to max_num_seqs
         cudagraph_capture_sizes=[1, 2, 4, 8, 16],
     ),
 )
 ```
 You can disable graph capturing completely via the `enforce_eager` flag:
@@ -129,20 +127,18 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.
 Here are some examples:
-??? Code
 ```python
 from vllm import LLM

 # Available for Qwen2-VL series models
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
           mm_processor_kwargs={
               "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
           })

 # Available for InternVL series models
 llm = LLM(model="OpenGVLab/InternVL2-2B",
           mm_processor_kwargs={
               "max_dynamic_patch": 4,  # Default is 12
           })
 ```

View File

@@ -7,8 +7,6 @@ vLLM uses the following environment variables to configure the system:
 All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
-??? Code
-
-    ```python
-    --8<-- "vllm/envs.py:env-vars-definition"
-    ```
+```python
+--8<-- "vllm/envs.py:env-vars-definition"
+```
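To make the Kubernetes warning above concrete (a hedged sketch; the exact variables Kubernetes injects depend on the cluster): a Service named `vllm` makes Kubernetes export discovery variables such as `VLLM_PORT` into pods, which can shadow vLLM's own `VLLM_`-prefixed settings.

```python
import os

# Illustration only: a Kubernetes Service named "vllm" can inject a
# service-discovery variable like VLLM_PORT="tcp://10.0.0.1:8000" into pods,
# which vLLM would then try to read as one of its own VLLM_* settings.
os.environ.setdefault("VLLM_PORT", "tcp://10.0.0.1:8000")  # hypothetically injected by k8s

def read_vllm_port() -> int:
    return int(os.environ["VLLM_PORT"])  # fails: the injected value is not a plain port number

try:
    read_vllm_port()
except ValueError as err:
    print(f"conflicting VLLM_PORT value: {err}")
```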

View File

@@ -93,27 +93,25 @@ For additional features and advanced configurations, refer to the official [MkDo
 ## Testing
-??? note "Commands"
 ```bash
 pip install -r requirements/dev.txt

 # Linting, formatting and static type checking
 pre-commit install --hook-type pre-commit --hook-type commit-msg

 # You can manually run pre-commit with
 pre-commit run --all-files

 # To manually run something from CI that does not run
 # locally by default, you can run:
 pre-commit run mypy-3.9 --hook-stage manual --all-files

 # Unit tests
 pytest tests/

 # Run tests for a single test file with detailed output
 pytest -s -v tests/test_logger.py
 ```
 !!! tip
     Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.

View File

@@ -27,35 +27,33 @@ All vLLM modules within the model must include a `prefix` argument in their cons
 The initialization code should look like this:
-??? Code
 ```python
 from torch import nn
 from vllm.config import VllmConfig
 from vllm.attention import Attention

 class MyAttention(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str):
         super().__init__()
         self.attn = Attention(prefix=f"{prefix}.attn")

 class MyDecoderLayer(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str):
         super().__init__()
         self.self_attn = MyAttention(prefix=f"{prefix}.self_attn")

 class MyModel(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str):
         super().__init__()
         self.layers = nn.ModuleList(
             [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)]
         )

 class MyModelForCausalLM(nn.Module):
     def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         self.model = MyModel(vllm_config, prefix=f"{prefix}.model")
 ```
 ### Computation Code

View File

@ -25,63 +25,59 @@ Further update the model as follows:
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. - Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
??? Code ```python
class YourModelForImage2Seq(nn.Module):
...
```python def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
class YourModelForImage2Seq(nn.Module):
...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
assert self.vision_encoder is not None def get_multimodal_embeddings(
image_features = self.vision_encoder(image_input) self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( # Validate the multimodal input keyword arguments
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
# Validate the multimodal input keyword arguments # Run multimodal inputs through encoder and projector
image_input = self._parse_and_validate_image_input(**kwargs) vision_embeddings = self._process_image_input(image_input)
if image_input is None: return vision_embeddings
return None ```
# Run multimodal inputs through encoder and projector
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
```
!!! important
    The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
??? Code ```python
from .utils import merge_multimodal_embeddings
```python class YourModelForImage2Seq(nn.Module):
from .utils import merge_multimodal_embeddings ...
class YourModelForImage2Seq(nn.Module): def get_input_embeddings(
... self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
def get_input_embeddings( # `get_input_embeddings` should already be implemented for the language
self, # model as one of the requirements of basic vLLM model implementation.
input_ids: torch.Tensor, inputs_embeds = self.language_model.get_input_embeddings(input_ids)
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language if multimodal_embeddings is not None:
# model as one of the requirements of basic vLLM model implementation. inputs_embeds = merge_multimodal_embeddings(
inputs_embeds = self.language_model.get_input_embeddings(input_ids) input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=self.config.image_token_index)
if multimodal_embeddings is not None: return inputs_embeds
inputs_embeds = merge_multimodal_embeddings( ```
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=self.config.image_token_index)
return inputs_embeds
```
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
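To make the embedding shape requirement quoted above concrete, here is a hedged illustration of the two accepted forms for three image items (toy sizes only, not taken from any particular model):

```python
import torch

num_items, feature_size, hidden_size = 3, 576, 4096

# Form 1: a single 3D tensor, one block of features per multimodal item.
batched = torch.zeros(num_items, feature_size, hidden_size)

# Form 2: a list/tuple of 2D tensors, one per item (feature_size may vary per item).
per_item = [torch.zeros(feature_size, hidden_size) for _ in range(num_items)]

# Either way, indexing with i yields the embeddings of the i-th item.
assert batched[0].shape == per_item[0].shape == (feature_size, hidden_size)
```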
@ -139,46 +135,42 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Looking at the code of HF's `LlavaForConditionalGeneration`: Looking at the code of HF's `LlavaForConditionalGeneration`:
??? Code ```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
n_image_features = image_features.shape[0] * image_features.shape[1]
```python if n_image_tokens != n_image_features:
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544 raise ValueError(
n_image_tokens = (input_ids == self.config.image_token_index).sum().item() f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
n_image_features = image_features.shape[0] * image_features.shape[1]
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
special_image_mask = (
(input_ids == self.config.image_token_index)
.unsqueeze(-1)
.expand_as(inputs_embeds)
.to(inputs_embeds.device)
) )
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) special_image_mask = (
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) (input_ids == self.config.image_token_index)
``` .unsqueeze(-1)
.expand_as(inputs_embeds)
.to(inputs_embeds.device)
)
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
```
The number of placeholder feature tokens per image is `image_features.shape[1]`. The number of placeholder feature tokens per image is `image_features.shape[1]`.
`image_features` is calculated inside the `get_image_features` method: `image_features` is calculated inside the `get_image_features` method:
??? Code ```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
```python selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300 if vision_feature_select_strategy == "default":
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = selected_image_feature[:, 1:]
elif vision_feature_select_strategy == "full":
selected_image_feature = image_outputs.hidden_states[vision_feature_layer] selected_image_feature = selected_image_feature
if vision_feature_select_strategy == "default": else:
selected_image_feature = selected_image_feature[:, 1:] raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
elif vision_feature_select_strategy == "full": image_features = self.multi_modal_projector(selected_image_feature)
selected_image_feature = selected_image_feature return image_features
else: ```
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
image_features = self.multi_modal_projector(selected_image_feature)
return image_features
```
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model). (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
@ -201,22 +193,20 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
target_dtype = self.patch_embedding.weight.dtype
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
if interpolate_pos_encoding:
    embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
else:
    embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
```

We can infer that `embeddings.shape[1] == self.num_positions`, where
@ -228,59 +218,55 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Overall, the number of placeholder feature tokens for an image can be calculated as:

```python
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    hf_config = self.get_hf_config()
    hf_processor = self.get_hf_processor()

    image_size = hf_config.vision_config.image_size
    patch_size = hf_config.vision_config.patch_size

    num_image_tokens = (image_size // patch_size) ** 2 + 1
    if hf_processor.vision_feature_select_strategy == "default":
        num_image_tokens -= 1

    return num_image_tokens
```

Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size` to calculate the multimodal profiling data:
```python
# NOTE: In actuality, this is usually implemented as part of the
# model's subclass of `BaseProcessingInfo`, but we show it as is
# here for simplicity.
def get_image_size_with_most_features(self) -> ImageSize:
    hf_config = self.get_hf_config()
    width = height = hf_config.image_size
    return ImageSize(width=width, height=height)

def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)

    target_width, target_height = \
        self.info.get_image_size_with_most_features()

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }
```

For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
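
A minimal sketch of what that expansion can look like, assuming a `get_dummy_text` hook on the dummy-inputs builder and an `image_token` attribute on the HF processor (both are assumptions used here for illustration):

```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)
    # Assumed: the HF processor exposes the placeholder string, e.g. "<image>"
    image_token = self.info.get_hf_processor().image_token
    return image_token * num_images
```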
@ -298,23 +284,21 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
Looking at the code of HF's `FuyuForCausalLM`:

```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
if image_patches is not None and past_key_values is None:
    patch_embeddings = [
        self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
        .squeeze(0)
        .to(inputs_embeds.device)
        for patch in image_patches
    ]
    inputs_embeds = self.gather_continuous_embeddings(
        word_embeddings=inputs_embeds,
        continuous_embeddings=patch_embeddings,
        image_patch_input_indices=image_patches_indices,
    )
```

The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
@ -328,98 +312,92 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
returning the dimensions after resizing (but before padding) as metadata.

```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
batch_images = image_encoding["images"]
image_unpadded_heights = image_encoding["image_unpadded_heights"]
image_unpadded_widths = image_encoding["image_unpadded_widths"]

# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
if do_resize:
    batch_images = [
        [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
        for images in batch_images
    ]

image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]

if do_pad:
    batch_images = [
        [
            self.pad_image(
                image,
                size=size,
                mode=padding_mode,
                constant_values=padding_value,
                input_data_format=input_data_format,
            )
            for image in images
        ]
        for images in batch_images
    ]
```

In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
    image_input=tensor_batch_images,
    image_present=image_present,
    image_unpadded_h=image_unpadded_heights,
    image_unpadded_w=image_unpadded_widths,
    image_placeholder_id=image_placeholder_id,
    image_newline_id=image_newline_id,
    variable_sized=True,
)

# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
image_height, image_width = image.shape[1], image.shape[2]
if variable_sized:  # variable_sized=True
    new_h = min(
        image_height,
        math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
    )
    new_w = min(
        image_width,
        math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
    )
    image = image[:, :new_h, :new_w]
    image_height, image_width = new_h, new_w

num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
tensor_of_image_ids = torch.full(
    [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
)
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
assert num_patches == patches.shape[0]
```

The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
patch_size = patch_size if patch_size is not None else self.patch_size
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]

if image_height % patch_height != 0:
    raise ValueError(f"{image_height=} must be divisible by {patch_height}")
if image_width % patch_width != 0:
    raise ValueError(f"{image_width=} must be divisible by {patch_width}")

num_patches_per_dim_h = image_height // patch_height
num_patches_per_dim_w = image_width // patch_width
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
```

These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
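
A minimal sketch of the corresponding helper, assuming (as in the LLaVA example above) that it lives on the processing-info class and that the processor's target size is exposed via `image_processor.size`:

```python
def get_image_size_with_most_features(self) -> ImageSize:
    image_processor = self.get_image_processor()
    # The processor resizes inputs to fit within this size, so an image of
    # exactly this size yields the maximum number of patches.
    return ImageSize(width=image_processor.size["width"],
                     height=image_processor.size["height"])
```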
@ -441,25 +419,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
For the multimodal image profiling data, the logic is very similar to LLaVA:

```python
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> MultiModalDataDict:
    target_width, target_height = \
        self.info.get_image_size_with_most_features()
    num_images = mm_counts.get("image", 0)

    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }
```

## 4. Specify processing details
@ -479,7 +455,6 @@ return a schema of the tensors outputted by the HF processor that are related to
The output of `CLIPImageProcessor` is a simple tensor with shape
`(num_images, num_channels, image_height, image_width)`:

```python
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
images = [
@ -530,37 +505,35 @@ return a schema of the tensors outputted by the HF processor that are related to
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:

```python
def _call_hf_processor(
    self,
    prompt: str,
    mm_data: Mapping[str, object],
    mm_kwargs: Mapping[str, object],
) -> BatchFeature:
    processed_outputs = super()._call_hf_processor(
        prompt=prompt,
        mm_data=mm_data,
        mm_kwargs=mm_kwargs,
    )

    image_patches = processed_outputs.get("image_patches")
    if image_patches is not None:
        images = mm_data["images"]
        assert isinstance(images, list)

        # Original output: (1, num_images, Pn, Px * Py * C)
        # New output: (num_images, Pn, Px * Py * C)
        assert (isinstance(image_patches, list)
                and len(image_patches) == 1)
        assert (isinstance(image_patches[0], torch.Tensor)
                and len(image_patches[0]) == len(images))

        processed_outputs["image_patches"] = image_patches[0]

    return processed_outputs
```
!!! note
    Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
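
With the extra batch dimension removed, each entry of `image_patches` again lines up one-to-one with the input images, so the field schema can mark it as batched per image item. A minimal sketch (the hook name and `MultiModalFieldConfig` usage follow the vLLM multimodal API; treat the exact signature as an assumption, not text from this page):

```python
def _get_mm_fields_config(
    self,
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    return dict(
        # One (Pn, Px * Py * C) tensor per input image
        image_patches=MultiModalFieldConfig.batched("image"),
    )
```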
@ -600,37 +573,35 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:

```python
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    image_token_id = hf_config.image_token_index

    def get_replacement(item_idx: int):
        images = mm_items.get_items("image", ImageProcessorItems)

        image_size = images.get_image_size(item_idx)
        num_image_tokens = self.info.get_num_image_tokens(
            image_width=image_size.width,
            image_height=image_size.height,
        )

        return [image_token_id] * num_image_tokens

    return [
        PromptReplacement(
            modality="image",
            target=[image_token_id],
            replacement=get_replacement,
        ),
    ]
```
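
To make the effect concrete, here is a toy illustration of what the replacement does to a tokenized prompt containing a single image placeholder. The token ids below are made up for the example and are not LLaVA's real vocabulary:

```python
# Toy illustration only; the ids below are assumptions, not real vocabulary ids.
image_token_id = 32000
prompt_ids = [1, 3148, 1001, 29901, 32000, 13]  # roughly "<s> USER: <image>\n"
num_image_tokens = 576                          # from get_num_image_tokens()

expanded: list[int] = []
for tok in prompt_ids:
    expanded.extend([image_token_id] * num_image_tokens if tok == image_token_id
                    else [tok])

# The repeated placeholders later line up one-to-one with the image features.
assert expanded.count(image_token_id) == num_image_tokens
```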
=== "Handling additional tokens: Fuyu" === "Handling additional tokens: Fuyu"
@ -645,90 +616,117 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
We define a helper function to return `ncols` and `nrows` directly:

```python
def get_image_feature_grid_size(
    self,
    *,
    image_width: int,
    image_height: int,
) -> tuple[int, int]:
    image_processor = self.get_image_processor()
    target_width = image_processor.size["width"]
    target_height = image_processor.size["height"]
    patch_width = image_processor.patch_size["width"]
    patch_height = image_processor.patch_size["height"]

    if not (image_width <= target_width and image_height <= target_height):
        height_scale_factor = target_height / image_height
        width_scale_factor = target_width / image_width
        optimal_scale_factor = min(height_scale_factor, width_scale_factor)

        image_height = int(image_height * optimal_scale_factor)
        image_width = int(image_width * optimal_scale_factor)

    ncols = math.ceil(image_width / patch_width)
    nrows = math.ceil(image_height / patch_height)
    return ncols, nrows
```
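
For a feel of the magnitudes involved, a quick calculation with Fuyu's commonly cited processor defaults (target size 1920x1080, 30x30 patches; treat these numbers as assumptions rather than values quoted from this page):

```python
import math

# Assumed Fuyu defaults: size = {"height": 1080, "width": 1920}, patch 30x30
target_width, target_height = 1920, 1080
patch_width = patch_height = 30

# An image exactly at the target size is not rescaled, so it maximizes the grid
ncols = math.ceil(target_width / patch_width)    # 64 patch columns
nrows = math.ceil(target_height / patch_height)  # 36 patch rows
print(ncols, nrows, ncols * nrows)               # 64 36 2304 patches
```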
Based on this, we can initially define our replacement tokens as:

```python
def get_replacement(item_idx: int):
    images = mm_items.get_items("image", ImageProcessorItems)
    image_size = images.get_image_size(item_idx)

    ncols, nrows = self.info.get_image_feature_grid_size(
        image_width=image_size.width,
        image_height=image_size.height,
    )

    # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
    # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
    return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
```

However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
a BOS token (`<s>`) is also added to the prompt:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
    image_input=tensor_batch_images,
    image_present=image_present,
    image_unpadded_h=image_unpadded_heights,
    image_unpadded_w=image_unpadded_widths,
    image_placeholder_id=image_placeholder_id,
    image_newline_id=image_newline_id,
    variable_sized=True,
)
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
    tokenizer=self.tokenizer,
    prompts=prompts,
    scale_factors=scale_factors,
    max_tokens_to_generate=self.max_tokens_to_generate,
    max_position_embeddings=self.max_position_embeddings,
    add_BOS=True,
    add_beginning_of_answer_token=True,
)
```

To assign the vision embeddings to only the image tokens, instead of a string
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
```python
hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id  # `<s>`
assert isinstance(bos_token_id, int)

def get_replacement_fuyu(item_idx: int):
    images = mm_items.get_items("image", ImageProcessorItems)
    image_size = images.get_image_size(item_idx)

    ncols, nrows = self.info.get_image_feature_grid_size(
        image_width=image_size.width,
        image_height=image_size.height,
    )
    image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                    [_NEWLINE_TOKEN_ID]) * nrows

    return PromptUpdateDetails.select_token_id(
        image_tokens + [bos_token_id],
        embed_token_id=_IMAGE_TOKEN_ID,
    )
```

Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
we can search for it to conduct the replacement at the start of the string:

```python
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
    hf_config = self.info.get_hf_config()
    bos_token_id = hf_config.bos_token_id
    assert isinstance(bos_token_id, int)

    tokenizer = self.info.get_tokenizer()
    eot_token_id = tokenizer.bos_token_id
    assert isinstance(eot_token_id, int)

    def get_replacement_fuyu(item_idx: int):
        images = mm_items.get_items("image", ImageProcessorItems)
        image_size = images.get_image_size(item_idx)

        ncols, nrows = self.info.get_image_feature_grid_size(
            image_width=image_size.width,
            image_height=image_size.height,
        )
        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                        [_NEWLINE_TOKEN_ID]) * nrows

        return PromptUpdateDetails.select_token_id(
            image_tokens + [bos_token_id],
            embed_token_id=_IMAGE_TOKEN_ID,
        )

    return [
        PromptReplacement(
            modality="image",
            target=[eot_token_id],
            replacement=get_replacement_fuyu,
        )
    ]
```
## 5. Register processor-related classes
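
After the info, dummy-inputs, and processor classes are defined, they are attached to the model class. A rough sketch of what this registration typically looks like (the decorator and class names follow vLLM's multimodal registry API and are assumptions here, not text from this page):

```python
@MULTIMODAL_REGISTRY.register_processor(
    FuyuMultiModalProcessor,            # the BaseMultiModalProcessor subclass
    info=FuyuProcessingInfo,            # the BaseProcessingInfo subclass
    dummy_inputs=FuyuDummyInputsBuilder,
)
class FuyuForCausalLM(nn.Module, SupportsMultiModal):
    ...
```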

View File

@ -30,21 +30,13 @@ Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example
#### OpenAI Server

```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile \
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Meta-Llama-3-70B
```

benchmark_serving.py:

```bash
python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Meta-Llama-3-70B \
    --dataset-name sharegpt \
    --dataset-path sharegpt.json \
    --profile \
    --num-prompts 2
```
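
The traces are written to `VLLM_TORCH_PROFILER_DIR` once profiling stops. If you want to drive profiling from your own client instead of `benchmark_serving.py --profile`, recent vLLM servers expose start/stop profiling routes when the environment variable is set; a minimal sketch (the endpoint paths are an assumption, so verify them against your vLLM version):

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed server address

# Assumed endpoints; only available when VLLM_TORCH_PROFILER_DIR is set.
requests.post(f"{BASE_URL}/start_profile")
# ... issue the requests you want captured, e.g. via the /v1 chat API ...
requests.post(f"{BASE_URL}/stop_profile")  # traces land in VLLM_TORCH_PROFILER_DIR
```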
## Profile with NVIDIA Nsight Systems
@ -72,16 +64,7 @@ For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fo
The following is an example using the `benchmarks/benchmark_latency.py` script:

```bash
nsys profile -o report.nsys-rep \
    --trace-fork-before-exec=true \
    --cuda-graph-trace=node \
    python benchmarks/benchmark_latency.py \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --num-iters-warmup 5 \
    --num-iters 1 \
    --batch-size 16 \
    --input-len 512 \
    --output-len 8
```
#### OpenAI Server
@ -90,21 +73,10 @@ To profile the server, you will want to prepend your `vllm serve` command with `
```bash
# server
nsys profile -o report.nsys-rep \
    --trace-fork-before-exec=true \
    --cuda-graph-trace=node \
    --delay 30 \
    --duration 60 \
    vllm serve meta-llama/Llama-3.1-8B-Instruct

# client
python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --num-prompts 1 \
    --dataset-name random \
    --random-input 1024 \
    --random-output 512
```
In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
@ -125,26 +97,26 @@ to manually kill the profiler and generate your `nsys-rep` report.
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).

CLI example:

```bash
nsys stats report1.nsys-rep
...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):

Time (%)  Total Time (ns)  Instances  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
--------  ---------------  ---------  --------  --------  --------  --------  -----------  ----------------------------------------------------------------------------------------------------
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in…
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
...
```

GUI example:

View File

@ -10,7 +10,7 @@ title: Using Docker
vLLM offers an official Docker image for deployment.
The image can be used to run an OpenAI-compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
@ -22,7 +22,7 @@ docker run --runtime nvidia --gpus all \
This image can also be used with other container engines such as [Podman](https://podman.io/).

```bash
podman run --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
@ -71,7 +71,7 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:

```bash
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
DOCKER_BUILDKIT=1 docker build . \
    --target vllm-openai \
@ -97,28 +97,26 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).

```bash
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
python3 use_existing_torch.py
DOCKER_BUILDKIT=1 docker build . \
    --file docker/Dockerfile \
    --target vllm-openai \
    --platform "linux/arm64" \
    -t vllm/vllm-gh200-openai:latest \
    --build-arg max_jobs=66 \
    --build-arg nvcc_threads=2 \
    --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
    --build-arg vllm_fa_cmake_gpu_arches="90-real"
```
!!! note
    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.

    Run the following command on your host machine to register QEMU user static handlers:

    ```bash
    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
    ```
@ -128,7 +126,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
To run vLLM with the custom-built Docker image:

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \

View File

@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g.

```bash
vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
```

View File

@ -11,7 +11,7 @@ title: AutoGen
- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment

```bash
pip install vllm

# Install AgentChat and OpenAI client from Extensions
@ -23,60 +23,58 @@ pip install -U "autogen-agentchat" "autogen-ext[openai]"
- Start the vLLM server with the supported chat completion model, e.g.

```bash
python -m vllm.entrypoints.openai.api_server \
    --model mistralai/Mistral-7B-Instruct-v0.2
```
- Call it with AutoGen:

```python
import asyncio
from autogen_core.models import UserMessage
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_core.models import ModelFamily


async def main() -> None:
    # Create a model client
    model_client = OpenAIChatCompletionClient(
        model="mistralai/Mistral-7B-Instruct-v0.2",
        base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
        api_key="EMPTY",
        model_info={
            "vision": False,
            "function_calling": False,
            "json_output": False,
            "family": ModelFamily.MISTRAL,
            "structured_output": True,
        },
    )

    messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]

    # Create a stream.
    stream = model_client.create_stream(messages=messages)

    # Iterate over the stream and print the responses.
    print("Streamed responses:")
    async for response in stream:
        if isinstance(response, str):
            # A partial response is a string.
            print(response, flush=True, end="")
        else:
            # The last response is a CreateResult object with the complete message.
            print("\n\n------------\n")
            print("The complete response:", flush=True)
            print(response.content, flush=True)

    # Close the client when done.
    await model_client.close()


asyncio.run(main())
```
For details, see the tutorial:

View File

@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr
To install the Cerebrium client, run:

```bash
pip install cerebrium
cerebrium login
```
Next, to create your Cerebrium project, run:

```bash
cerebrium init vllm-project
```
@ -34,81 +34,75 @@ vllm = "latest"
Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example). Add the following code to your `main.py`:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")

def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95):
    sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
    outputs = llm.generate(prompts, sampling_params)

    # Collect the outputs.
    results = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        results.append({"prompt": prompt, "generated_text": generated_text})

    return {"results": results}
```
Then, run the following command to deploy it to the cloud:

```bash
cerebrium deploy
```
If the deployment is successful, you should be returned a CURL command that you can call inference against. Just remember to end the URL with the function name you are calling (in our case `/run`).

```bash
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
    -H 'Content-Type: application/json' \
    -H 'Authorization: <JWT TOKEN>' \
    --data '{
      "prompts": [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is"
      ]
    }'
```
You should get a response like:

```json
{
    "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
    "result": {
        "result": [
            {
                "prompt": "Hello, my name is",
                "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of"
            },
            {
                "prompt": "The president of the United States is",
                "generated_text": " elected every four years. This is a democratic system.\n\n5. What"
            },
            {
                "prompt": "The capital of France is",
                "generated_text": " Paris.\n"
            },
            {
                "prompt": "The future of AI is",
                "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective."
            }
        ]
    },
    "run_time_ms": 152.53663063049316
}
```
You now have an autoscaling endpoint where you only pay for the compute you use!

View File

@ -15,7 +15,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Start the vLLM server with the supported chat completion model, e.g.

```bash
vllm serve qwen/Qwen1.5-0.5B-Chat
```

View File

@ -18,13 +18,13 @@ This guide walks you through deploying Dify using a vLLM backend.
- Start the vLLM server with the supported chat completion model, e.g.

```bash
vllm serve Qwen/Qwen1.5-7B-Chat
```

- Start the Dify server with docker compose ([details](https://github.com/langgenius/dify?tab=readme-ov-file#quick-start)):

```bash
git clone https://github.com/langgenius/dify.git
cd dify
cd docker

View File

@ -11,14 +11,14 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/),
To install the dstack client, run:

```bash
pip install "dstack[all]"
dstack server
```
Next, to configure your dstack project, run:

```bash
mkdir -p vllm-dstack
cd vllm-dstack
dstack init
```
@ -26,81 +26,75 @@ dstack init
Next, to provision a VM instance with an LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:

```yaml
type: service

python: "3.11"
env:
  - MODEL=NousResearch/Llama-2-7b-chat-hf
port: 8000
resources:
  gpu: 24GB
commands:
  - pip install vllm
  - vllm serve $MODEL --port 8000
model:
  format: openai
  type: chat
  name: NousResearch/Llama-2-7b-chat-hf
```
Then, run the following CLI for provisioning:

```console
$ dstack run . -f serve.dstack.yml

⠸ Getting run plan...
 Configuration       serve.dstack.yml
 Project             deep-diver-main
 User                deep-diver
 Min resources       2..xCPU, 8GB.., 1xGPU (24GB)
 Max price           -
 Max duration        -
 Spot policy         auto
 Retry policy        no

 #  BACKEND  REGION       INSTANCE       RESOURCES                               SPOT  PRICE
 1  gcp      us-central1  g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
 2  gcp      us-east1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
 3  gcp      us-west1     g2-standard-4  4xCPU, 16GB, 1xL4 (24GB), 100GB (disk)  yes   $0.223804
    ...
 Shown 3 of 193 offers, $5.876 max

Continue? [y/n]: y
⠙ Submitting run...
⠏ Launching spicy-treefrog-1 (pulling)
spicy-treefrog-1 provisioning completed (running)
Service is published at ...
```
After the provisioning, you can interact with the model by using the OpenAI SDK:

```python
from openai import OpenAI

client = OpenAI(
    base_url="https://gateway.<gateway domain>",
    api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
)

completion = client.chat.completions.create(
    model="NousResearch/Llama-2-7b-chat-hf",
    messages=[
        {
            "role": "user",
            "content": "Compose a poem that explains the concept of recursion in programming.",
        }
    ]
)

print(completion.choices[0].message.content)
```
!!! note
    dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. If you want more hands-on material on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).

View File

@ -13,7 +13,7 @@ It allows you to deploy a large language model (LLM) server with vLLM as the bac
- Setup vLLM and Haystack environment

```bash
pip install vllm haystack-ai
```
@ -21,35 +21,35 @@ pip install vllm haystack-ai
- Start the vLLM server with the supported chat completion model, e.g.

```bash
vllm serve mistralai/Mistral-7B-Instruct-v0.1
```
- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.

```python
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage
from haystack.utils import Secret

generator = OpenAIChatGenerator(
    # for compatibility with the OpenAI API, a placeholder api_key is needed
    api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
    model="mistralai/Mistral-7B-Instruct-v0.1",
    api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
    generation_kwargs={"max_tokens": 512}
)

response = generator.run(
    messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
)

print("-"*30)
print(response)
print("-"*30)
```

Output e.g.:
```console
------------------------------

View File

@ -22,7 +22,7 @@ Before you begin, ensure that you have the following:
To install the chart with the release name `test-vllm`:

```bash
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
```
@ -30,7 +30,7 @@ helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f val
To uninstall the `test-vllm` deployment:

```bash
helm uninstall test-vllm --namespace=ns-vllm
```

View File

@ -18,7 +18,7 @@ And LiteLLM supports all models on VLLM.
- Setup vLLM and litellm environment

```bash
pip install vllm litellm
```
@ -28,35 +28,33 @@ pip install vllm litellm
- Start the vLLM server with the supported chat completion model, e.g.

```bash
vllm serve qwen/Qwen1.5-0.5B-Chat
```
- Call it with litellm:

```python
import litellm

messages = [{"content": "Hello, how are you?", "role": "user"}]

# The "hosted_vllm/" prefix is required so litellm routes the call to the vLLM server
response = litellm.completion(
    model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat",  # pass the vllm model name
    messages=messages,
    api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
    temperature=0.2,
    max_tokens=80)

print(response)
```
### Embeddings ### Embeddings
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```bash ```console
vllm serve BAAI/bge-base-en-v1.5 vllm serve BAAI/bge-base-en-v1.5
``` ```
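- Call it with litellm. The snippet below is a minimal sketch; the `hosted_vllm/` prefix and the placeholder host/port mirror the chat completion example above and are assumptions to adapt to your deployment.

```python
import litellm

# Query the embedding model served by vLLM through LiteLLM.
# The "hosted_vllm/" prefix and the placeholder host/port are assumptions
# mirroring the chat completion example above.
response = litellm.embedding(
    model="hosted_vllm/BAAI/bge-base-en-v1.5",
    input=["Hello, how are you?"],
    api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
)

print(response.data[0]["embedding"][:8])  # first few embedding dimensions
```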
@ -17,101 +17,99 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber
Deploy the following yaml file `lws.yaml` Deploy the following yaml file `lws.yaml`
??? Yaml ```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
```yaml kind: LeaderWorkerSet
apiVersion: leaderworkerset.x-k8s.io/v1 metadata:
kind: LeaderWorkerSet name: vllm
metadata: spec:
name: vllm replicas: 2
spec: leaderWorkerTemplate:
replicas: 2 size: 2
leaderWorkerTemplate: restartPolicy: RecreateGroupOnPodRestart
size: 2 leaderTemplate:
restartPolicy: RecreateGroupOnPodRestart metadata:
leaderTemplate: labels:
metadata: role: leader
labels: spec:
role: leader containers:
spec: - name: vllm-leader
containers: image: docker.io/vllm/vllm-openai:latest
- name: vllm-leader env:
image: docker.io/vllm/vllm-openai:latest - name: HUGGING_FACE_HUB_TOKEN
env: value: <your-hf-token>
- name: HUGGING_FACE_HUB_TOKEN command:
value: <your-hf-token> - sh
command: - -c
- sh - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
- -c python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); resources:
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" limits:
resources: nvidia.com/gpu: "8"
limits: memory: 1124Gi
nvidia.com/gpu: "8" ephemeral-storage: 800Gi
memory: 1124Gi requests:
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
requests: cpu: 125
ephemeral-storage: 800Gi ports:
cpu: 125 - containerPort: 8080
ports: readinessProbe:
- containerPort: 8080 tcpSocket:
readinessProbe: port: 8080
tcpSocket: initialDelaySeconds: 15
port: 8080 periodSeconds: 10
initialDelaySeconds: 15 volumeMounts:
periodSeconds: 10 - mountPath: /dev/shm
volumeMounts: name: dshm
- mountPath: /dev/shm volumes:
name: dshm - name: dshm
volumes: emptyDir:
- name: dshm medium: Memory
emptyDir: sizeLimit: 15Gi
medium: Memory workerTemplate:
sizeLimit: 15Gi spec:
workerTemplate: containers:
spec: - name: vllm-worker
containers: image: docker.io/vllm/vllm-openai:latest
- name: vllm-worker command:
image: docker.io/vllm/vllm-openai:latest - sh
command: - -c
- sh - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
- -c resources:
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" limits:
resources: nvidia.com/gpu: "8"
limits: memory: 1124Gi
nvidia.com/gpu: "8" ephemeral-storage: 800Gi
memory: 1124Gi requests:
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
requests: cpu: 125
ephemeral-storage: 800Gi env:
cpu: 125 - name: HUGGING_FACE_HUB_TOKEN
env: value: <your-hf-token>
- name: HUGGING_FACE_HUB_TOKEN volumeMounts:
value: <your-hf-token> - mountPath: /dev/shm
volumeMounts: name: dshm
- mountPath: /dev/shm volumes:
name: dshm - name: dshm
volumes: emptyDir:
- name: dshm medium: Memory
emptyDir: sizeLimit: 15Gi
medium: Memory ---
sizeLimit: 15Gi apiVersion: v1
--- kind: Service
apiVersion: v1 metadata:
kind: Service name: vllm-leader
metadata: spec:
name: vllm-leader ports:
spec: - name: http
ports: port: 8080
- name: http protocol: TCP
port: 8080 targetPort: 8080
protocol: TCP selector:
targetPort: 8080 leaderworkerset.sigs.k8s.io/name: vllm
selector: role: leader
leaderworkerset.sigs.k8s.io/name: vllm type: ClusterIP
role: leader ```
type: ClusterIP
```
```bash ```bash
kubectl apply -f lws.yaml kubectl apply -f lws.yaml
@ -177,27 +175,25 @@ curl http://localhost:8080/v1/completions \
The output should be similar to the following: The output should be similar to the following:
??? Output ```text
{
```text "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"object": "text_completion",
"created": 1715138766,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{ {
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d", "index": 0,
"object": "text_completion", "text": " top destination for foodies, with",
"created": 1715138766, "logprobs": null,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct", "finish_reason": "length",
"choices": [ "stop_reason": null
{
"index": 0,
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
}
],
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
} }
``` ],
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
}
```
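For reference, the same completion request can be issued from Python with the OpenAI client once the `vllm-leader` service is reachable. This is a minimal sketch; the port-forward command and the placeholder API key are assumptions.

```python
# Minimal sketch: query the LWS deployment through its OpenAI-compatible API.
# Assumes the vllm-leader service has been made reachable locally, e.g.:
#   kubectl port-forward svc/vllm-leader 8080:8080
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")

completion = client.completions.create(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct",
    prompt="San Francisco is a",
    max_tokens=7,
)
print(completion.choices[0].text)
```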
@ -7,13 +7,13 @@ title: Open WebUI
2. Start the vLLM server with the supported chat completion model, e.g. 2. Start the vLLM server with the supported chat completion model, e.g.
```bash ```console
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
```bash ```console
docker run -d -p 3000:8080 \ docker run -d -p 3000:8080 \
--name open-webui \ --name open-webui \
-v open-webui:/app/backend/data \ -v open-webui:/app/backend/data \
@ -15,7 +15,7 @@ Here are the integrations:
- Setup vLLM and langchain environment - Setup vLLM and langchain environment
```bash ```console
pip install -U vllm \ pip install -U vllm \
langchain_milvus langchain_openai \ langchain_milvus langchain_openai \
langchain_community beautifulsoup4 \ langchain_community beautifulsoup4 \
@ -26,14 +26,14 @@ pip install -U vllm \
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```bash ```console
# Start embedding service (port 8000) # Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base vllm serve ssmits/Qwen2-7B-Instruct-embed-base
``` ```
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```bash ```console
# Start chat service (port 8001) # Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
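- Both endpoints can then be wired into LangChain through its OpenAI-compatible wrappers. The snippet below is a minimal sketch, not the bundled example script; the localhost URLs, the placeholder API key, and the disabled context-length check are assumptions.

```python
# Minimal sketch: point LangChain's OpenAI-compatible wrappers at the two
# vLLM servers started above. The localhost URLs, placeholder API key, and
# the disabled context-length check are assumptions for a local setup.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="ssmits/Qwen2-7B-Instruct-embed-base",
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
    # vLLM expects plain strings, so skip the OpenAI-specific token handling.
    check_embedding_ctx_length=False,
)

llm = ChatOpenAI(
    model="qwen/Qwen1.5-0.5B-Chat",
    base_url="http://localhost:8001/v1",
    api_key="EMPTY",
)

print(len(embeddings.embed_query("What is vLLM?")))
print(llm.invoke("Summarize vLLM in one sentence.").content)
```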
@ -52,7 +52,7 @@ python retrieval_augmented_generation_with_langchain.py
- Setup vLLM and llamaindex environment - Setup vLLM and llamaindex environment
```bash ```console
pip install vllm \ pip install vllm \
llama-index llama-index-readers-web \ llama-index llama-index-readers-web \
llama-index-llms-openai-like \ llama-index-llms-openai-like \
@ -64,14 +64,14 @@ pip install vllm \
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```bash ```console
# Start embedding service (port 8000) # Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base vllm serve ssmits/Qwen2-7B-Instruct-embed-base
``` ```
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```bash ```console
# Start chat service (port 8001) # Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
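- The chat endpoint can then be used from LlamaIndex via the `OpenAILike` wrapper installed above. This is a minimal sketch; the localhost URL and placeholder API key are assumptions.

```python
# Minimal sketch: use the vLLM chat server from LlamaIndex through OpenAILike.
# The localhost URL and placeholder API key are assumptions for a local setup.
from llama_index.llms.openai_like import OpenAILike

llm = OpenAILike(
    model="qwen/Qwen1.5-0.5B-Chat",
    api_base="http://localhost:8001/v1",
    api_key="EMPTY",
    is_chat_model=True,
)

print(llm.complete("What is vLLM?"))
```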
@ -15,7 +15,7 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). - Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes are enabled. - Check that `sky check` shows clouds or Kubernetes are enabled.
```bash ```console
pip install skypilot-nightly pip install skypilot-nightly
sky check sky check
``` ```
@ -24,54 +24,52 @@ sky check
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
??? Yaml ```yaml
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
```yaml envs:
resources: MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs: setup: |
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct conda create -n vllm python=3.10 -y
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. conda activate vllm
setup: | pip install vllm==0.4.0.post1
conda create -n vllm python=3.10 -y # Install Gradio for web UI.
conda activate vllm pip install gradio openai
pip install flash-attn==2.5.7
pip install vllm==0.4.0.post1 run: |
# Install Gradio for web UI. conda activate vllm
pip install gradio openai echo 'Starting vllm api server...'
pip install flash-attn==2.5.7 python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
run: | echo 'Waiting for vllm api server to start...'
conda activate vllm while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log &
echo 'Waiting for vllm api server to start...' echo 'Starting gradio server...'
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
echo 'Starting gradio server...' -m $MODEL_NAME \
git clone https://github.com/vllm-project/vllm.git || true --port 8811 \
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ --model-url http://localhost:8081/v1 \
-m $MODEL_NAME \ --stop-token-ids 128009,128001
--port 8811 \ ```
--model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
```
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
```bash ```console
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
``` ```
@ -83,7 +81,7 @@ Check the output of the command. There will be a shareable gradio link (like the
**Optional**: Serve the 70B model instead of the default 8B and use more GPU: **Optional**: Serve the 70B model instead of the default 8B and use more GPU:
```bash ```console
HF_TOKEN="your-huggingface-token" \ HF_TOKEN="your-huggingface-token" \
sky launch serving.yaml \ sky launch serving.yaml \
--gpus A100:8 \ --gpus A100:8 \
@ -95,71 +93,72 @@ HF_TOKEN="your-huggingface-token" \
SkyPilot can scale the service up to multiple replicas with built-in autoscaling, load balancing, and fault tolerance. You can do this by adding a `service` section to the YAML file. SkyPilot can scale the service up to multiple replicas with built-in autoscaling, load balancing, and fault tolerance. You can do this by adding a `service` section to the YAML file.
??? Yaml ```yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1
```
```yaml <details>
service: <summary>Click to see the full recipe YAML</summary>
replicas: 2
# An actual request for readiness probe. ```yaml
readiness_probe: service:
path: /v1/chat/completions replicas: 2
post_data: # An actual request for readiness probe.
model: $MODEL_NAME readiness_probe:
messages: path: /v1/chat/completions
- role: user post_data:
content: Hello! What is your name? model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1 max_completion_tokens: 1
```
??? Yaml resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
```yaml envs:
service: MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
replicas: 2 HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1
resources: setup: |
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. conda create -n vllm python=3.10 -y
use_spot: True conda activate vllm
disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
ports: 8081 # Expose to internet traffic.
envs: pip install vllm==0.4.0.post1
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct # Install Gradio for web UI.
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. pip install gradio openai
pip install flash-attn==2.5.7
setup: | run: |
conda create -n vllm python=3.10 -y conda activate vllm
conda activate vllm echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
```
pip install vllm==0.4.0.post1 </details>
# Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
```
Start serving the Llama-3 8B model on multiple replicas: Start serving the Llama-3 8B model on multiple replicas:
```bash ```console
HF_TOKEN="your-huggingface-token" \ HF_TOKEN="your-huggingface-token" \
sky serve up -n vllm serving.yaml \ sky serve up -n vllm serving.yaml \
--env HF_TOKEN --env HF_TOKEN
@ -167,11 +166,12 @@ HF_TOKEN="your-huggingface-token" \
Wait until the service is ready: Wait until the service is ready:
```bash ```console
watch -n10 sky serve status vllm watch -n10 sky serve status vllm
``` ```
Example outputs: <details>
<summary>Example outputs:</summary>
```console ```console
Services Services
@ -184,29 +184,29 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
``` ```
</details>
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
??? Commands ```console
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
```bash curl -L http://$ENDPOINT/v1/chat/completions \
ENDPOINT=$(sky serve status --endpoint 8081 vllm) -H "Content-Type: application/json" \
curl -L http://$ENDPOINT/v1/chat/completions \ -d '{
-H "Content-Type: application/json" \ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-d '{ "messages": [
"model": "meta-llama/Meta-Llama-3-8B-Instruct", {
"messages": [ "role": "system",
{ "content": "You are a helpful assistant."
"role": "system", },
"content": "You are a helpful assistant." {
}, "role": "user",
{ "content": "Who are you?"
"role": "user", }
"content": "Who are you?" ],
} "stop_token_ids": [128009, 128001]
], }'
"stop_token_ids": [128009, 128001] ```
}'
```
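The same request can be made from Python with the OpenAI client. This is a minimal sketch; the placeholder API key is an assumption, and the vLLM-specific `stop_token_ids` field is passed through `extra_body`.

```python
# Minimal sketch: query the SkyPilot service endpoint with the OpenAI client.
# Assumes ENDPOINT was set via `ENDPOINT=$(sky serve status --endpoint 8081 vllm)`
# and exported; the placeholder API key is an assumption.
import os
from openai import OpenAI

client = OpenAI(base_url=f"http://{os.environ['ENDPOINT']}/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who are you?"},
    ],
    # stop_token_ids is a vLLM extension, so it goes through extra_body.
    extra_body={"stop_token_ids": [128009, 128001]},
)
print(response.choices[0].message.content)
```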
To enable autoscaling, you can replace `replicas` with the following configuration in `service`: To enable autoscaling, you can replace `replicas` with the following configuration in `service`:
@ -220,64 +220,67 @@ service:
This will scale the service up when the QPS exceeds 2 per replica. This will scale the service up when the QPS exceeds 2 per replica.
??? Yaml <details>
<summary>Click to see the full recipe YAML</summary>
```yaml ```yaml
service: service:
replica_policy: replica_policy:
min_replicas: 2 min_replicas: 2
max_replicas: 4 max_replicas: 4
target_qps_per_replica: 2 target_qps_per_replica: 2
# An actual request for readiness probe. # An actual request for readiness probe.
readiness_probe: readiness_probe:
path: /v1/chat/completions path: /v1/chat/completions
post_data: post_data:
model: $MODEL_NAME model: $MODEL_NAME
messages: messages:
- role: user - role: user
content: Hello! What is your name? content: Hello! What is your name?
max_completion_tokens: 1 max_completion_tokens: 1
resources: resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True use_spot: True
disk_size: 512 # Ensure model checkpoints can fit. disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best disk_tier: best
ports: 8081 # Expose to internet traffic. ports: 8081 # Expose to internet traffic.
envs: envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
setup: | setup: |
conda create -n vllm python=3.10 -y conda create -n vllm python=3.10 -y
conda activate vllm conda activate vllm
pip install vllm==0.4.0.post1 pip install vllm==0.4.0.post1
# Install Gradio for web UI. # Install Gradio for web UI.
pip install gradio openai pip install gradio openai
pip install flash-attn==2.5.7 pip install flash-attn==2.5.7
run: | run: |
conda activate vllm conda activate vllm
echo 'Starting vllm api server...' echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \ python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \ --port 8081 \
--model $MODEL_NAME \ --model $MODEL_NAME \
--trust-remote-code \ --trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log 2>&1 | tee api_server.log
``` ```
</details>
To update the service with the new config: To update the service with the new config:
```bash ```console
HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
``` ```
To stop the service: To stop the service:
```bash ```console
sky serve down vllm sky serve down vllm
``` ```
@ -285,39 +288,42 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas. It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
??? Yaml <details>
<summary>Click to see the full GUI YAML</summary>
```yaml ```yaml
envs: envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
resources: resources:
cpus: 2 cpus: 2
setup: | setup: |
conda create -n vllm python=3.10 -y conda create -n vllm python=3.10 -y
conda activate vllm conda activate vllm
# Install Gradio for web UI. # Install Gradio for web UI.
pip install gradio openai pip install gradio openai
run: | run: |
conda activate vllm conda activate vllm
export PATH=$PATH:/sbin export PATH=$PATH:/sbin
echo 'Starting gradio server...' echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \ -m $MODEL_NAME \
--port 8811 \ --port 8811 \
--model-url http://$ENDPOINT/v1 \ --model-url http://$ENDPOINT/v1 \
--stop-token-ids 128009,128001 | tee ~/gradio.log --stop-token-ids 128009,128001 | tee ~/gradio.log
``` ```
</details>
1. Start the chat web UI: 1. Start the chat web UI:
```bash ```console
sky launch \ sky launch \
-c gui ./gui.yaml \ -c gui ./gui.yaml \
--env ENDPOINT=$(sky serve status --endpoint vllm) --env ENDPOINT=$(sky serve status --endpoint vllm)
@ -15,13 +15,13 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```bash ```console
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
- Install streamlit and openai: - Install streamlit and openai:
```bash ```console
pip install streamlit openai pip install streamlit openai
``` ```
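- The bundled example script wires Streamlit to the vLLM server through the OpenAI client; the sketch below shows the core idea only and is not the shipped script. The localhost URL, placeholder API key, and file name are assumptions.

```python
# chatbot_sketch.py: a minimal sketch, not the bundled example script.
# Assumes the vLLM server above is reachable at http://localhost:8000/v1.
import streamlit as st
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for m in st.session_state.messages:
    with st.chat_message(m["role"]):
        st.write(m["content"])

if prompt := st.chat_input("Ask something"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.write(prompt)
    reply = client.chat.completions.create(
        model="qwen/Qwen1.5-0.5B-Chat",
        messages=st.session_state.messages,
    ).choices[0].message.content
    st.session_state.messages.append({"role": "assistant", "content": reply})
    with st.chat_message("assistant"):
        st.write(reply)
```

Launch it with `streamlit run chatbot_sketch.py` (assuming the file name above).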
@ -29,7 +29,7 @@ pip install streamlit openai
- Start the streamlit web UI and start to chat: - Start the streamlit web UI and start to chat:
```bash ```console
streamlit run streamlit_openai_chatbot_webserver.py streamlit run streamlit_openai_chatbot_webserver.py
# or specify the VLLM_API_BASE or VLLM_API_KEY # or specify the VLLM_API_BASE or VLLM_API_KEY
@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
To install Llama Stack, run To install Llama Stack, run
```bash ```console
pip install llama-stack -q pip install llama-stack -q
``` ```
@ -60,22 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai
curl -o- http://localhost:30080/models curl -o- http://localhost:30080/models
``` ```
??? Output Expected output:
```json ```json
{
"object": "list",
"data": [
{ {
"object": "list", "id": "facebook/opt-125m",
"data": [ "object": "model",
{ "created": 1737428424,
"id": "facebook/opt-125m", "owned_by": "vllm",
"object": "model", "root": null
"created": 1737428424,
"owned_by": "vllm",
"root": null
}
]
} }
``` ]
}
```
To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint: To send an actual chatting request, you can issue a curl request to the OpenAI `/completion` endpoint:
@ -89,23 +89,23 @@ curl -X POST http://localhost:30080/completions \
}' }'
``` ```
??? Output Expected output:
```json ```json
{
"id": "completion-id",
"object": "text_completion",
"created": 1737428424,
"model": "facebook/opt-125m",
"choices": [
{ {
"id": "completion-id", "text": " there was a brave knight who...",
"object": "text_completion", "index": 0,
"created": 1737428424, "finish_reason": "length"
"model": "facebook/opt-125m",
"choices": [
{
"text": " there was a brave knight who...",
"index": 0,
"finish_reason": "length"
}
]
} }
``` ]
}
```
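Equivalently, the router can be queried from Python with the OpenAI client. This is a minimal sketch; the placeholder API key and the example prompt are assumptions chosen to match the expected output above.

```python
# Minimal sketch: send the same completion request to the router with the
# OpenAI client. The router in this example serves /completions without a
# /v1 prefix, so base_url omits it; the API key and prompt are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30080", api_key="EMPTY")

completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="Once upon a time,",
    max_tokens=10,
)
print(completion.choices[0].text)
```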
### Uninstall ### Uninstall
@ -121,25 +121,23 @@ sudo helm uninstall vllm
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above: The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
??? Yaml ```yaml
servingEngineSpec:
runtimeClassName: ""
modelSpec:
- name: "opt125m"
repository: "vllm/vllm-openai"
tag: "latest"
modelURL: "facebook/opt-125m"
```yaml replicaCount: 1
servingEngineSpec:
runtimeClassName: ""
modelSpec:
- name: "opt125m"
repository: "vllm/vllm-openai"
tag: "latest"
modelURL: "facebook/opt-125m"
replicaCount: 1 requestCPU: 6
requestMemory: "16Gi"
requestGPU: 1
requestCPU: 6 pvcStorage: "10Gi"
requestMemory: "16Gi" ```
requestGPU: 1
pvcStorage: "10Gi"
```
In this YAML configuration: In this YAML configuration:
* **`modelSpec`** includes: * **`modelSpec`** includes:
@ -29,93 +29,89 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model: First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
??? Config ```bash
cat <<EOF |kubectl apply -f -
```bash apiVersion: v1
cat <<EOF |kubectl apply -f - kind: PersistentVolumeClaim
apiVersion: v1 metadata:
kind: PersistentVolumeClaim name: vllm-models
metadata: spec:
name: vllm-models accessModes:
spec: - ReadWriteOnce
accessModes: volumeMode: Filesystem
- ReadWriteOnce resources:
volumeMode: Filesystem requests:
resources: storage: 50Gi
requests: ---
storage: 50Gi apiVersion: v1
--- kind: Secret
apiVersion: v1 metadata:
kind: Secret name: hf-token-secret
metadata: type: Opaque
name: hf-token-secret data:
type: Opaque token: $(HF_TOKEN)
data: EOF
token: $(HF_TOKEN) ```
EOF
```
Next, start the vLLM server as a Kubernetes Deployment and Service: Next, start the vLLM server as a Kubernetes Deployment and Service:
??? Config ```bash
cat <<EOF |kubectl apply -f -
```bash apiVersion: apps/v1
cat <<EOF |kubectl apply -f - kind: Deployment
apiVersion: apps/v1 metadata:
kind: Deployment name: vllm-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata: metadata:
name: vllm-server labels:
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata:
labels:
app.kubernetes.io/name: vllm
spec:
containers:
- name: vllm
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args: [
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server
spec:
selector:
app.kubernetes.io/name: vllm app.kubernetes.io/name: vllm
ports: spec:
- protocol: TCP containers:
port: 8000 - name: vllm
targetPort: 8000 image: vllm/vllm-openai:latest
type: ClusterIP command: ["/bin/sh", "-c"]
EOF args: [
``` "vllm serve meta-llama/Llama-3.2-1B-Instruct"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server
spec:
selector:
app.kubernetes.io/name: vllm
ports:
- protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```bash ```console
kubectl logs -l app.kubernetes.io/name=vllm kubectl logs -l app.kubernetes.io/name=vllm
... ...
INFO: Started server process [1] INFO: Started server process [1]
@ -132,9 +128,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
A PVC is used to store the model cache; it is optional, and you can use hostPath or other storage options instead. A PVC is used to store the model cache; it is optional, and you can use hostPath or other storage options instead.
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: v1 apiVersion: v1
kind: PersistentVolumeClaim kind: PersistentVolumeClaim
@ -151,8 +144,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
volumeMode: Filesystem volumeMode: Filesystem
``` ```
</details>
The Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models. The Secret is optional and only required for accessing gated models; you can skip this step if you are not using gated models.
```yaml ```yaml
@ -165,16 +156,13 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
stringData: stringData:
token: "REPLACE_WITH_TOKEN" token: "REPLACE_WITH_TOKEN"
``` ```
Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. Next, create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
Here are two examples for using NVIDIA GPU and AMD GPU. Here are two examples for using NVIDIA GPU and AMD GPU.
NVIDIA GPU: NVIDIA GPU:
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
@ -245,15 +233,10 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
periodSeconds: 5 periodSeconds: 5
``` ```
</details>
AMD GPU: AMD GPU:
You can refer to the `deployment.yaml` below if you are using an AMD ROCm GPU such as the MI300X. You can refer to the `deployment.yaml` below if you are using an AMD ROCm GPU such as the MI300X.
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
@ -322,17 +305,12 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
mountPath: /dev/shm mountPath: /dev/shm
``` ```
</details>
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>. You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
2. Create a Kubernetes Service for vLLM 2. Create a Kubernetes Service for vLLM
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
@ -352,20 +330,18 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
type: ClusterIP type: ClusterIP
``` ```
</details>
3. Deploy and Test 3. Deploy and Test
Apply the deployment and service configurations using `kubectl apply -f <filename>`: Apply the deployment and service configurations using `kubectl apply -f <filename>`:
```bash ```console
kubectl apply -f deployment.yaml kubectl apply -f deployment.yaml
kubectl apply -f service.yaml kubectl apply -f service.yaml
``` ```
To test the deployment, run the following `curl` command: To test the deployment, run the following `curl` command:
```bash ```console
curl http://mistral-7b.default.svc.cluster.local/v1/completions \ curl http://mistral-7b.default.svc.cluster.local/v1/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
@ -11,13 +11,13 @@ This document shows how to launch multiple vLLM serving containers and use Nginx
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
```bash ```console
export vllm_root=`pwd` export vllm_root=`pwd`
``` ```
Create a file named `Dockerfile.nginx`: Create a file named `Dockerfile.nginx`:
```dockerfile ```console
FROM nginx:latest FROM nginx:latest
RUN rm /etc/nginx/conf.d/default.conf RUN rm /etc/nginx/conf.d/default.conf
EXPOSE 80 EXPOSE 80
@ -26,7 +26,7 @@ CMD ["nginx", "-g", "daemon off;"]
Build the container: Build the container:
```bash ```console
docker build . -f Dockerfile.nginx --tag nginx-lb docker build . -f Dockerfile.nginx --tag nginx-lb
``` ```
@ -36,38 +36,36 @@ docker build . -f Dockerfile.nginx --tag nginx-lb
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
??? Config ```console
upstream backend {
```console least_conn;
upstream backend { server vllm0:8000 max_fails=3 fail_timeout=10000s;
least_conn; server vllm1:8000 max_fails=3 fail_timeout=10000s;
server vllm0:8000 max_fails=3 fail_timeout=10000s; }
server vllm1:8000 max_fails=3 fail_timeout=10000s; server {
listen 80;
location / {
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
} }
server { }
listen 80; ```
location / {
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
```
[](){ #nginxloadbalancer-nginx-vllm-container } [](){ #nginxloadbalancer-nginx-vllm-container }
## Build vLLM Container ## Build vLLM Container
```bash ```console
cd $vllm_root cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm docker build -f docker/Dockerfile . --tag vllm
``` ```
If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below: If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below:
```bash ```console
cd $vllm_root cd $vllm_root
docker build \ docker build \
-f docker/Dockerfile . \ -f docker/Dockerfile . \
@ -80,7 +78,7 @@ docker build \
## Create Docker Network ## Create Docker Network
```bash ```console
docker network create vllm_nginx docker network create vllm_nginx
``` ```
@ -95,32 +93,30 @@ Notes:
- The example below assumes a GPU backend is used. If you are using the CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. - The example below assumes a GPU backend is used. If you are using the CPU backend, remove `--gpus device=ID` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. - Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.
??? Commands ```console
mkdir -p ~/.cache/huggingface/hub/
```console hf_cache_dir=~/.cache/huggingface/
mkdir -p ~/.cache/huggingface/hub/ docker run \
hf_cache_dir=~/.cache/huggingface/ -itd \
docker run \ --ipc host \
-itd \ --network vllm_nginx \
--ipc host \ --gpus device=0 \
--network vllm_nginx \ --shm-size=10.24gb \
--gpus device=0 \ -v $hf_cache_dir:/root/.cache/huggingface/ \
--shm-size=10.24gb \ -p 8081:8000 \
-v $hf_cache_dir:/root/.cache/huggingface/ \ --name vllm0 vllm \
-p 8081:8000 \ --model meta-llama/Llama-2-7b-chat-hf
--name vllm0 vllm \ docker run \
--model meta-llama/Llama-2-7b-chat-hf -itd \
docker run \ --ipc host \
-itd \ --network vllm_nginx \
--ipc host \ --gpus device=1 \
--network vllm_nginx \ --shm-size=10.24gb \
--gpus device=1 \ -v $hf_cache_dir:/root/.cache/huggingface/ \
--shm-size=10.24gb \ -p 8082:8000 \
-v $hf_cache_dir:/root/.cache/huggingface/ \ --name vllm1 vllm \
-p 8082:8000 \ --model meta-llama/Llama-2-7b-chat-hf
--name vllm1 vllm \ ```
--model meta-llama/Llama-2-7b-chat-hf
```
!!! note !!! note
If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
@ -129,7 +125,7 @@ Notes:
## Launch Nginx ## Launch Nginx
```bash ```console
docker run \ docker run \
-itd \ -itd \
-p 8000:80 \ -p 8000:80 \
@ -142,7 +138,7 @@ docker run \
## Verify That vLLM Servers Are Ready ## Verify That vLLM Servers Are Ready
```bash ```console
docker logs vllm0 | grep Uvicorn docker logs vllm0 | grep Uvicorn
docker logs vllm1 | grep Uvicorn docker logs vllm1 | grep Uvicorn
``` ```
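Once both servers report `Uvicorn running`, requests sent to the Nginx port are distributed across them. The sketch below is a minimal client-side check; the placeholder API key and prompt are assumptions.

```python
# Minimal sketch: send a request through the Nginx load balancer on port 8000;
# Nginx forwards it to vllm0 or vllm1. The API key and prompt are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="meta-llama/Llama-2-7b-chat-hf",
    prompt="The capital of France is",
    max_tokens=8,
)
print(completion.choices[0].text)
```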
@ -22,33 +22,31 @@ server.
Here is a sample of `LLM` class usage: Here is a sample of `LLM` class usage:
??? Code ```python
from vllm import LLM, SamplingParams
```python # Define a list of input prompts
from vllm import LLM, SamplingParams prompts = [
"Hello, my name is",
"The capital of France is",
"The largest ocean is",
]
# Define a list of input prompts # Define sampling parameters
prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
"Hello, my name is",
"The capital of France is",
"The largest ocean is",
]
# Define sampling parameters # Initialize the LLM engine with the OPT-125M model
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM(model="facebook/opt-125m")
# Initialize the LLM engine with the OPT-125M model # Generate outputs for the input prompts
llm = LLM(model="facebook/opt-125m") outputs = llm.generate(prompts, sampling_params)
# Generate outputs for the input prompts # Print the generated outputs
outputs = llm.generate(prompts, sampling_params) for output in outputs:
prompt = output.prompt
# Print the generated outputs generated_text = output.outputs[0].text
for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
prompt = output.prompt ```
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
@ -180,34 +178,32 @@ vision-language model.
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
??? Code ```python
class MyOldModel(nn.Module):
def __init__(
self,
config,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
prefix: str = "",
) -> None:
...
```python from vllm.config import VllmConfig
class MyOldModel(nn.Module): class MyNewModel(MyOldModel):
def __init__( def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self, config = vllm_config.model_config.hf_config
config, cache_config = vllm_config.cache_config
cache_config: Optional[CacheConfig] = None, quant_config = vllm_config.quant_config
quant_config: Optional[QuantizationConfig] = None, lora_config = vllm_config.lora_config
lora_config: Optional[LoRAConfig] = None, super().__init__(config, cache_config, quant_config, lora_config, prefix)
prefix: str = "",
) -> None:
...
from vllm.config import VllmConfig if __version__ >= "0.6.4":
class MyNewModel(MyOldModel): MyModel = MyNewModel
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else:
config = vllm_config.model_config.hf_config MyModel = MyOldModel
cache_config = vllm_config.cache_config ```
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
super().__init__(config, cache_config, quant_config, lora_config, prefix)
if __version__ >= "0.6.4":
MyModel = MyNewModel
else:
MyModel = MyOldModel
```
This way, the model can work with both old and new versions of vLLM. This way, the model can work with both old and new versions of vLLM.
@ -448,29 +448,27 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in all results for output have been calculated but are just stored in
different thread register memory. different thread register memory.
??? Code ```cpp
float* out_smem = reinterpret_cast<float*>(shared_mem);
```cpp for (int i = NUM_WARPS; i > 1; i /= 2) {
float* out_smem = reinterpret_cast<float*>(shared_mem); // Upper warps write to shared memory.
for (int i = NUM_WARPS; i > 1; i /= 2) { ...
// Upper warps write to shared memory. float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
... ...
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; dst[row_idx] = accs[i];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output.
const float* src = &out_smem[warp_idx * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
accs[i] += src[row_idx];
}
// Write out the accs.
} }
```
// Lower warps update the output.
const float* src = &out_smem[warp_idx * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
accs[i] += src[row_idx];
}
// Write out the accs.
}
```
## Output ## Output
@ -13,30 +13,28 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
??? Code ```python
# inside `setup.py` file
from setuptools import setup
```python setup(name='vllm_add_dummy_model',
# inside `setup.py` file version='0.1',
from setuptools import setup packages=['vllm_add_dummy_model'],
entry_points={
'vllm.general_plugins':
["register_dummy_model = vllm_add_dummy_model:register"]
})
setup(name='vllm_add_dummy_model', # inside `vllm_add_dummy_model.py` file
version='0.1', def register():
packages=['vllm_add_dummy_model'], from vllm import ModelRegistry
entry_points={
'vllm.general_plugins':
["register_dummy_model = vllm_add_dummy_model:register"]
})
# inside `vllm_add_dummy_model.py` file if "MyLlava" not in ModelRegistry.get_supported_archs():
def register(): ModelRegistry.register_model(
from vllm import ModelRegistry "MyLlava",
"vllm_add_dummy_model.my_llava:MyLlava",
if "MyLlava" not in ModelRegistry.get_supported_archs(): )
ModelRegistry.register_model( ```
"MyLlava",
"vllm_add_dummy_model.my_llava:MyLlava",
)
```
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
@ -61,25 +61,23 @@ To address the above issues, I have designed and developed a local Tensor memory
# Install vLLM # Install vLLM
??? Commands ```shell
# Enter the home directory or your working directory.
cd /home
```shell # Download the installation package, and I will update the commit-id in time. You can directly copy the command.
# Enter the home directory or your working directory. wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
cd /home
# Download the installation package, and I will update the commit-id in time. You can directly copy the command. # Download the code repository.
wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
cd vllm
# Download the code repository. # Set the installation package path.
git clone -b xpyd-v1 https://github.com/Abatom/vllm.git export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
cd vllm
# Set the installation package path. # installation
export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install -e . -v
```
# installation
pip install -e . -v
```
# Run xPyD # Run xPyD
@ -106,91 +104,83 @@ python3 disagg_prefill_proxy_xpyd.py &
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ --port 20005 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20005 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.9 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.9 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) ### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ --port 20009 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20009 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.7 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.7 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode2 (e.g. 10.0.1.4 or 10.0.1.1) ### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ --port 20003 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20003 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.7 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.7 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) ### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \ --port 20008 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20008 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.7 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.7 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
## Run 3P1D ## Run 3P1D
@ -203,91 +193,83 @@ python3 disagg_prefill_proxy_xpyd.py &
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ --port 20005 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20005 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.9 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.9 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) ### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ --port 20009 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20009 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.9 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.9 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1) ### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ --port 20003 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20003 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.9 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.9 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode1 (e.g. 10.0.1.5 or 10.0.1.1) ### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command ```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
```shell --host 0.0.0.0 \
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \ --port 20008 \
--host 0.0.0.0 \ --tensor-parallel-size 1 \
--port 20008 \ --seed 1024 \
--tensor-parallel-size 1 \ --served-model-name base_model \
--seed 1024 \ --dtype float16 \
--served-model-name base_model \ --max-model-len 10000 \
--dtype float16 \ --max-num-batched-tokens 10000 \
--max-model-len 10000 \ --max-num-seqs 256 \
--max-num-batched-tokens 10000 \ --trust-remote-code \
--max-num-seqs 256 \ --gpu-memory-utilization 0.7 \
--trust-remote-code \ --disable-log-request \
--gpu-memory-utilization 0.7 \ --kv-transfer-config \
--disable-log-request \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
--kv-transfer-config \ ```
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
# Single request # Single request
@ -304,27 +286,25 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
# Benchmark # Benchmark
??? Command ```shell
python3 benchmark_serving.py \
```shell --backend vllm \
python3 benchmark_serving.py \ --model base_model \
--backend vllm \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \
--model base_model \ --dataset-name "random" \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \ --host 10.0.1.1 \
--dataset-name "random" \ --port 10001 \
--host 10.0.1.1 \ --random-input-len 1024 \
--port 10001 \ --random-output-len 1024 \
--random-input-len 1024 \ --ignore-eos \
--random-output-len 1024 \ --burstiness 100 \
--ignore-eos \ --percentile-metrics "ttft,tpot,itl,e2el" \
--burstiness 100 \ --metric-percentiles "90,95,99" \
--percentile-metrics "ttft,tpot,itl,e2el" \ --seed $(date +%s) \
--metric-percentiles "90,95,99" \ --trust-remote-code \
--seed $(date +%s) \ --request-rate 3 \
--trust-remote-code \ --num-prompts 1000
--request-rate 3 \ ```
--num-prompts 1000
```
# Shut down # Shut down

View File

@ -28,29 +28,27 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
In the very verbose logs, we can see: In the very verbose logs, we can see:
??? Logs ```
DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
```text DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339> DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache): DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py ```
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
```
This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function at `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, other functions are also called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access triggers a function call) and some communication / attention / activation functions from vLLM. All the traced files are considered when we decide which cache directory to use, so any code change in the above files will trigger a compilation cache miss, and therefore recompilation.
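As a rough mental model (an illustration only, not vLLM's actual implementation), you can think of the cache directory as being keyed by a hash over the contents of every traced file, so that editing any one of them produces a new key and a fresh compilation:

```python
import hashlib
from pathlib import Path

def compile_cache_key(traced_files: list[str]) -> str:
    """Illustrative sketch: hash the contents of all traced files."""
    digest = hashlib.sha256()
    for path in sorted(traced_files):
        digest.update(Path(path).read_bytes())
    return digest.hexdigest()[:10]

# Such a key could then select a directory like
# ~/.cache/vllm/torch_compile_cache/<key>/rank_0_0/
```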
@ -101,31 +99,28 @@ This time, Inductor compilation is completely bypassed, and we will load from di
The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:
```bash ```
vllm serve meta-llama/Llama-3.2-1B \ vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
--compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
``` ```
Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
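The same option can also be set from the offline API; a minimal sketch, assuming your vLLM version exposes the `compilation_config` argument on `LLM` (recent releases accept a plain dict here):

```python
from vllm import LLM

# Compile dedicated kernels for these batch sizes in addition to the
# general symbolic-shape compilation.
llm = LLM(
    model="meta-llama/Llama-3.2-1B",
    compilation_config={"compile_sizes": [1, 2, 4, 8]},
)
```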
When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log: When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log:
??? Logs ```
AUTOTUNE mm(8x2048, 2048x3072)
``` triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
AUTOTUNE mm(8x2048, 2048x3072) triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 mm 0.0160 ms 81.6%
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
mm 0.0160 ms 81.6% triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 ```
SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
```
This means that, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries the Triton template with various configs, and the best config found is much faster than the default code (which dispatches to the cuBLAS library).
@ -141,9 +136,8 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:
```bash ```
vllm serve meta-llama/Llama-3.2-1B \ vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
--compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
``` ```
Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
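The offline API offers the same control; a sketch assuming `CompilationConfig` is importable from `vllm.config` in your version (a plain dict with the same key also works):

```python
from vllm import LLM
from vllm.config import CompilationConfig

# Capture CUDA graphs only for these batch sizes.
llm = LLM(
    model="meta-llama/Llama-3.2-1B",
    compilation_config=CompilationConfig(cudagraph_capture_sizes=[1, 2, 4, 8]),
)
```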

View File

@ -29,26 +29,24 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter. the third parameter is the path to the LoRA adapter.
??? Code ```python
sampling_params = SamplingParams(
temperature=0,
max_tokens=256,
stop=["[/assistant]"]
)
```python prompts = [
sampling_params = SamplingParams( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
temperature=0, "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
max_tokens=256, ]
stop=["[/assistant]"]
)
prompts = [ outputs = llm.generate(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", prompts,
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", sampling_params,
] lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
)
outputs = llm.generate( ```
prompts,
sampling_params,
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
)
```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
@ -70,26 +68,24 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.): with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.):
??? Command ```bash
curl localhost:8000/v1/models | jq .
```bash {
curl localhost:8000/v1/models | jq . "object": "list",
{ "data": [
"object": "list", {
"data": [ "id": "meta-llama/Llama-2-7b-hf",
{ "object": "model",
"id": "meta-llama/Llama-2-7b-hf", ...
"object": "model", },
... {
}, "id": "sql-lora",
{ "object": "model",
"id": "sql-lora", ...
"object": "model", }
... ]
} }
] ```
}
```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
@ -172,36 +168,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface. 1. Implement the LoRAResolver interface.
??? Example of a simple S3 LoRAResolver implementation Example of a simple S3 LoRAResolver implementation:
```python ```python
import os import os
import s3fs import s3fs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver from vllm.lora.resolver import LoRAResolver
class S3LoRAResolver(LoRAResolver): class S3LoRAResolver(LoRAResolver):
def __init__(self): def __init__(self):
self.s3 = s3fs.S3FileSystem() self.s3 = s3fs.S3FileSystem()
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
async def resolve_lora(self, base_model_name, lora_name): async def resolve_lora(self, base_model_name, lora_name):
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
# Download the LoRA from S3 to the local path # Download the LoRA from S3 to the local path
await self.s3._get( await self.s3._get(
s3_path, local_path, recursive=True, maxdepth=1 s3_path, local_path, recursive=True, maxdepth=1
) )
lora_request = LoRARequest( lora_request = LoRARequest(
lora_name=lora_name, lora_name=lora_name,
lora_path=local_path, lora_path=local_path,
lora_int_id=abs(hash(lora_name)) lora_int_id=abs(hash(lora_name))
) )
return lora_request return lora_request
``` ```
2. Register `LoRAResolver` plugin. 2. Register `LoRAResolver` plugin.
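One plausible way to perform the registration, assuming your vLLM version ships a `LoRAResolverRegistry` in `vllm.lora.resolver` (verify the exact API against your installed version), is to reuse the `S3LoRAResolver` class from step 1:

```python
from vllm.lora.resolver import LoRAResolverRegistry

# Assumed helper: expose the resolver under a name of your choice so the
# server can look up unknown LoRA names through it.
LoRAResolverRegistry.register_resolver("s3_resolver", S3LoRAResolver())
```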
@ -238,40 +234,38 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the lora adapter. - The `root` field points to the artifact location of the lora adapter.
??? Command output ```bash
$ curl http://localhost:8000/v1/models
```bash {
$ curl http://localhost:8000/v1/models "object": "list",
"data": [
{ {
"object": "list", "id": "meta-llama/Llama-2-7b-hf",
"data": [ "object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"parent": null,
"permission": [
{ {
"id": "meta-llama/Llama-2-7b-hf", .....
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"parent": null,
"permission": [
{
.....
}
]
},
{
"id": "sql-lora",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
"parent": meta-llama/Llama-2-7b-hf,
"permission": [
{
....
}
]
} }
] ]
} },
``` {
"id": "sql-lora",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
"parent": meta-llama/Llama-2-7b-hf,
"permission": [
{
....
}
]
}
]
}
```

View File

@ -20,117 +20,111 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
??? Code ```python
from vllm import LLM
```python llm = LLM(model="llava-hf/llava-1.5-7b-hf")
from vllm import LLM
llm = LLM(model="llava-hf/llava-1.5-7b-hf") # Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Refer to the HuggingFace repo for the correct format to use # Load the image using PIL.Image
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" image = PIL.Image.open(...)
# Load the image using PIL.Image # Single prompt inference
image = PIL.Image.open(...) outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": image},
})
# Single prompt inference for o in outputs:
outputs = llm.generate({ generated_text = o.outputs[0].text
"prompt": prompt, print(generated_text)
"multi_modal_data": {"image": image},
})
for o in outputs: # Batch inference
generated_text = o.outputs[0].text image_1 = PIL.Image.open(...)
print(generated_text) image_2 = PIL.Image.open(...)
outputs = llm.generate(
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
# Batch inference for o in outputs:
image_1 = PIL.Image.open(...) generated_text = o.outputs[0].text
image_2 = PIL.Image.open(...) print(generated_text)
outputs = llm.generate( ```
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language.py> Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead: To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
??? Code ```python
from vllm import LLM
```python llm = LLM(
from vllm import LLM model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
)
llm = LLM( # Refer to the HuggingFace repo for the correct format to use
model="microsoft/Phi-3.5-vision-instruct", prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
)
# Refer to the HuggingFace repo for the correct format to use # Load the images using PIL.Image
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" image1 = PIL.Image.open(...)
image2 = PIL.Image.open(...)
# Load the images using PIL.Image outputs = llm.generate({
image1 = PIL.Image.open(...) "prompt": prompt,
image2 = PIL.Image.open(...) "multi_modal_data": {
"image": [image1, image2]
},
})
outputs = llm.generate({ for o in outputs:
"prompt": prompt, generated_text = o.outputs[0].text
"multi_modal_data": { print(generated_text)
"image": [image1, image2] ```
},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py> Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
??? Code ```python
from vllm import LLM
```python # Specify the maximum number of frames per video to be 4. This can be changed.
from vllm import LLM llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
# Specify the maximum number of frames per video to be 4. This can be changed. # Create the request payload.
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) video_frames = ... # load your video making sure it only has the number of frames specified earlier.
message = {
"role": "user",
"content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
],
}
for i in range(len(video_frames)):
base64_image = encode_image(video_frames[i]) # base64 encoding.
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
message["content"].append(new_image)
# Create the request payload. # Perform inference and log output.
video_frames = ... # load your video making sure it only has the number of frames specified earlier. outputs = llm.chat([message])
message = {
"role": "user",
"content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
],
}
for i in range(len(video_frames)):
base64_image = encode_image(video_frames[i]) # base64 encoding.
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
message["content"].append(new_image)
# Perform inference and log output. for o in outputs:
outputs = llm.chat([message]) generated_text = o.outputs[0].text
print(generated_text)
for o in outputs: ```
generated_text = o.outputs[0].text
print(generated_text)
```
### Video Inputs ### Video Inputs
@ -150,72 +144,68 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
??? Code ```python
from vllm import LLM
```python # Inference with image embeddings as input
from vllm import LLM llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Inference with image embeddings as input # Refer to the HuggingFace repo for the correct format to use
llm = LLM(model="llava-hf/llava-1.5-7b-hf") prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Refer to the HuggingFace repo for the correct format to use # Embeddings for single image
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...)
# Embeddings for single image outputs = llm.generate({
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) "prompt": prompt,
image_embeds = torch.load(...) "multi_modal_data": {"image": image_embeds},
})
outputs = llm.generate({ for o in outputs:
"prompt": prompt, generated_text = o.outputs[0].text
"multi_modal_data": {"image": image_embeds}, print(generated_text)
}) ```
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
??? Code ```python
# Construct the prompt based on your model
prompt = ...
```python # Embeddings for multiple images
# Construct the prompt based on your model # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
prompt = ... image_embeds = torch.load(...)
# Embeddings for multiple images # Qwen2-VL
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
image_embeds = torch.load(...) mm_data = {
"image": {
# Qwen2-VL "image_embeds": image_embeds,
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) # image_grid_thw is needed to calculate positional encoding.
mm_data = { "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
"image": {
"image_embeds": image_embeds,
# image_grid_thw is needed to calculate positional encoding.
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
}
} }
}
# MiniCPM-V # MiniCPM-V
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
mm_data = { mm_data = {
"image": { "image": {
"image_embeds": image_embeds, "image_embeds": image_embeds,
# image_sizes is needed to calculate details of the sliced image. # image_sizes is needed to calculate details of the sliced image.
"image_sizes": [image.size for image in images], # list of image sizes "image_sizes": [image.size for image in images], # list of image sizes
}
} }
}
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
## Online Serving ## Online Serving
@ -245,53 +235,51 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
??? Code ```python
from openai import OpenAI
```python openai_api_key = "EMPTY"
from openai import OpenAI openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY" client = OpenAI(
openai_api_base = "http://localhost:8000/v1" api_key=openai_api_key,
base_url=openai_api_base,
)
client = OpenAI( # Single-image input inference
api_key=openai_api_key, image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
base_url=openai_api_base,
)
# Single-image input inference chat_response = client.chat.completions.create(
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" model="microsoft/Phi-3.5-vision-instruct",
messages=[{
"role": "user",
"content": [
# NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server.
{"type": "text", "text": "Whats in this image?"},
{"type": "image_url", "image_url": {"url": image_url}},
],
}],
)
print("Chat completion output:", chat_response.choices[0].message.content)
chat_response = client.chat.completions.create( # Multi-image input inference
model="microsoft/Phi-3.5-vision-instruct", image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
messages=[{ image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
"role": "user",
"content": [
# NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server.
{"type": "text", "text": "Whats in this image?"},
{"type": "image_url", "image_url": {"url": image_url}},
],
}],
)
print("Chat completion output:", chat_response.choices[0].message.content)
# Multi-image input inference chat_response = client.chat.completions.create(
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" model="microsoft/Phi-3.5-vision-instruct",
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" messages=[{
"role": "user",
chat_response = client.chat.completions.create( "content": [
model="microsoft/Phi-3.5-vision-instruct", {"type": "text", "text": "What are the animals in these images?"},
messages=[{ {"type": "image_url", "image_url": {"url": image_url_duck}},
"role": "user", {"type": "image_url", "image_url": {"url": image_url_lion}},
"content": [ ],
{"type": "text", "text": "What are the animals in these images?"}, }],
{"type": "image_url", "image_url": {"url": image_url_duck}}, )
{"type": "image_url", "image_url": {"url": image_url_lion}}, print("Chat completion output:", chat_response.choices[0].message.content)
], ```
}],
)
print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -307,7 +295,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching images through HTTP URL is `5` seconds. By default, the timeout for fetching images through HTTP URL is `5` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```bash ```console
export VLLM_IMAGE_FETCH_TIMEOUT=<timeout> export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
``` ```
@ -323,46 +311,44 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
??? Code ```python
from openai import OpenAI
```python openai_api_key = "EMPTY"
from openai import OpenAI openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY" client = OpenAI(
openai_api_base = "http://localhost:8000/v1" api_key=openai_api_key,
base_url=openai_api_base,
)
client = OpenAI( video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
api_key=openai_api_key,
base_url=openai_api_base,
)
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" ## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
## Use video url in the payload messages=[{
chat_completion_from_url = client.chat.completions.create( "role":
messages=[{ "user",
"role": "content": [
"user", {
"content": [ "type": "text",
{ "text": "What's in this video?"
"type": "text", },
"text": "What's in this video?" {
"type": "video_url",
"video_url": {
"url": video_url
}, },
{ },
"type": "video_url", ],
"video_url": { }],
"url": video_url model=model,
}, max_completion_tokens=64,
}, )
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result) print("Chat completion output from image url:", result)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -370,7 +356,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching videos through HTTP URL is `30` seconds. By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```bash ```console
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout> export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
``` ```
@ -387,88 +373,84 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
??? Code ```python
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
```python def encode_base64_content_from_url(content_url: str) -> str:
import base64 """Encode a content retrieved from a remote url to base64 format."""
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def encode_base64_content_from_url(content_url: str) -> str: with requests.get(content_url) as response:
"""Encode a content retrieved from a remote url to base64 format.""" response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
with requests.get(content_url) as response: return result
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
return result openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY" client = OpenAI(
openai_api_base = "http://localhost:8000/v1" api_key=openai_api_key,
base_url=openai_api_base,
)
client = OpenAI( # Any format supported by librosa is supported
api_key=openai_api_key, audio_url = AudioAsset("winning_call").url
base_url=openai_api_base, audio_base64 = encode_base64_content_from_url(audio_url)
)
# Any format supported by librosa is supported chat_completion_from_base64 = client.chat.completions.create(
audio_url = AudioAsset("winning_call").url messages=[{
audio_base64 = encode_base64_content_from_url(audio_url) "role": "user",
"content": [
chat_completion_from_base64 = client.chat.completions.create( {
messages=[{ "type": "text",
"role": "user", "text": "What's in this audio?"
"content": [ },
{ {
"type": "text", "type": "input_audio",
"text": "What's in this audio?" "input_audio": {
"data": audio_base64,
"format": "wav"
}, },
{ },
"type": "input_audio", ],
"input_audio": { }],
"data": audio_base64, model=model,
"format": "wav" max_completion_tokens=64,
}, )
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result) print("Chat completion output from input audio:", result)
``` ```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
??? Code ```python
chat_completion_from_url = client.chat.completions.create(
```python messages=[{
chat_completion_from_url = client.chat.completions.create( "role": "user",
messages=[{ "content": [
"role": "user", {
"content": [ "type": "text",
{ "text": "What's in this audio?"
"type": "text", },
"text": "What's in this audio?" {
"type": "audio_url",
"audio_url": {
"url": audio_url
}, },
{ },
"type": "audio_url", ],
"audio_url": { }],
"url": audio_url model=model,
}, max_completion_tokens=64,
}, )
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result) print("Chat completion output from audio url:", result)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@ -476,7 +458,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching audios through HTTP URL is `10` seconds. By default, the timeout for fetching audios through HTTP URL is `10` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```bash ```console
export VLLM_AUDIO_FETCH_TIMEOUT=<timeout> export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
``` ```
@ -488,63 +470,61 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field. For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server: The following example demonstrates how to pass image embeddings to the OpenAI server:
??? Code ```python
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
```python buffer = io.BytesIO()
image_embedding = torch.load(...) torch.save(image_embedding, buffer)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
buffer = io.BytesIO() client = OpenAI(
torch.save(image_embedding, buffer) # defaults to os.environ.get("OPENAI_API_KEY")
buffer.seek(0) api_key=openai_api_key,
binary_data = buffer.read() base_url=openai_api_base,
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') )
client = OpenAI( # Basic usage - this is equivalent to the LLaVA example for offline inference
# defaults to os.environ.get("OPENAI_API_KEY") model = "llava-hf/llava-1.5-7b-hf"
api_key=openai_api_key, embeds = {
base_url=openai_api_base, "type": "image_embeds",
) "image_embeds": f"{base64_image_embedding}"
}
# Basic usage - this is equivalent to the LLaVA example for offline inference # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "llava-hf/llava-1.5-7b-hf" model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": f"{base64_image_embedding}" "image_embeds": {
} "image_embeds": f"{base64_image_embedding}" , # Required
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V) },
model = "Qwen/Qwen2-VL-2B-Instruct" }
embeds = { model = "openbmb/MiniCPM-V-2_6"
"type": "image_embeds", embeds = {
"image_embeds": { "type": "image_embeds",
"image_embeds": f"{base64_image_embedding}" , # Required "image_embeds": {
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct "image_embeds": f"{base64_image_embedding}" , # Required
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
}, },
} embeds,
model = "openbmb/MiniCPM-V-2_6" ],
embeds = { },
"type": "image_embeds", ],
"image_embeds": { model=model,
"image_embeds": f"{base64_image_embedding}" , # Required )
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 ```
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
},
embeds,
],
},
],
model=model,
)
```
!!! note !!! note
Only one message can contain `{"type": "image_embeds"}`. Only one message can contain `{"type": "image_embeds"}`.

View File

@ -9,41 +9,39 @@ The main benefits are lower latency and memory usage.
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq). You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq).
```bash ```console
pip install autoawq pip install autoawq
``` ```
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
??? Code ```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
```python model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
from awq import AutoAWQForCausalLM quant_path = 'mistral-instruct-v0.2-awq'
from transformers import AutoTokenizer quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
model_path = 'mistralai/Mistral-7B-Instruct-v0.2' # Load model
quant_path = 'mistral-instruct-v0.2-awq' model = AutoAWQForCausalLM.from_pretrained(
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Load model # Quantize
model = AutoAWQForCausalLM.from_pretrained( model.quantize(tokenizer, quant_config=quant_config)
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize # Save quantized model
model.quantize(tokenizer, quant_config=quant_config) model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
# Save quantized model print(f'Model is quantized and saved at "{quant_path}"')
model.save_quantized(quant_path) ```
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```bash ```console
python examples/offline_inference/llm_engine_example.py \ python examples/offline_inference/llm_engine_example.py \
--model TheBloke/Llama-2-7b-Chat-AWQ \ --model TheBloke/Llama-2-7b-Chat-AWQ \
--quantization awq --quantization awq
@ -51,29 +49,27 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint: AWQ models are also supported directly through the LLM entrypoint:
??? Code ```python
from vllm import LLM, SamplingParams
```python # Sample prompts.
from vllm import LLM, SamplingParams prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Sample prompts. # Create an LLM.
prompts = [ llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
"Hello, my name is", # Generate texts from the prompts. The output is a list of RequestOutput objects
"The president of the United States is", # that contain the prompt, generated text, and other information.
"The capital of France is", outputs = llm.generate(prompts, sampling_params)
"The future of AI is", # Print the outputs.
] for output in outputs:
# Create a sampling params object. prompt = output.prompt
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# Create an LLM. ```
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

View File

@ -12,7 +12,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic
Below are the steps to utilize BitBLAS with vLLM. Below are the steps to utilize BitBLAS with vLLM.
```bash ```console
pip install bitblas>=0.1.0 pip install bitblas>=0.1.0
``` ```
@ -43,19 +43,17 @@ llm = LLM(
## Read gptq format checkpoint ## Read gptq format checkpoint
??? Code ```python
from vllm import LLM
import torch
```python # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
from vllm import LLM model_id = "hxbgsyxh/llama-13b-4bit-g-1"
import torch llm = LLM(
model=model_id,
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. dtype=torch.float16,
model_id = "hxbgsyxh/llama-13b-4bit-g-1" trust_remote_code=True,
llm = LLM( quantization="bitblas",
model=model_id, max_model_len=1024
dtype=torch.float16, )
trust_remote_code=True, ```
quantization="bitblas",
max_model_len=1024
)
```

View File

@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
Below are the steps to utilize BitsAndBytes with vLLM. Below are the steps to utilize BitsAndBytes with vLLM.
```bash ```console
pip install bitsandbytes>=0.45.3 pip install bitsandbytes>=0.45.3
``` ```
@ -54,6 +54,6 @@ llm = LLM(
Append the following to your model arguments for 4bit inflight quantization: Append the following to your model arguments for 4bit inflight quantization:
```bash ```console
--quantization bitsandbytes --quantization bitsandbytes
``` ```
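The offline counterpart is a single argument on `LLM`; a minimal sketch (the model name is only an example, and some older vLLM versions additionally required `load_format="bitsandbytes"`):

```python
from vllm import LLM

# In-flight 4-bit quantization of an unquantized Hugging Face checkpoint.
llm = LLM(
    model="huggyllama/llama-7b",  # example model, substitute your own
    quantization="bitsandbytes",
)
```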

View File

@ -23,7 +23,7 @@ The FP8 types typically supported in hardware have two distinct representations,
To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```bash ```console
pip install llmcompressor pip install llmcompressor
``` ```
@ -58,30 +58,28 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
??? Code ```python
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
```python # Configure the simple PTQ quantization
from llmcompressor.transformers import oneshot recipe = QuantizationModifier(
from llmcompressor.modifiers.quantization import QuantizationModifier targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
# Configure the simple PTQ quantization # Apply the quantization algorithm.
recipe = QuantizationModifier( oneshot(model=model, recipe=recipe)
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
# Apply the quantization algorithm. # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
oneshot(model=model, recipe=recipe) SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic tokenizer.save_pretrained(SAVE_DIR)
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" ```
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
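As a quick sanity check before the accuracy evaluation, the saved directory can be loaded directly by vLLM; a minimal sketch, assuming the `SAVE_DIR` produced above:

```python
from vllm import LLM

# vLLM picks up the quantization config stored in the checkpoint automatically.
llm = LLM("Meta-Llama-3-8B-Instruct-FP8-Dynamic")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```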
### 3. Evaluating Accuracy ### 3. Evaluating Accuracy
Install `vllm` and `lm-evaluation-harness` for evaluation: Install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```console
pip install vllm lm-eval==0.4.4 pip install vllm lm-eval==0.4.4
``` ```
@ -99,9 +97,9 @@ Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
!!! note !!! note
Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
```bash ```console
MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
lm_eval \ $ lm_eval \
--model vllm \ --model vllm \
--model_args pretrained=$MODEL,add_bos_token=True \ --model_args pretrained=$MODEL,add_bos_token=True \
--tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250

View File

@ -11,7 +11,7 @@ title: GGUF
To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
```bash ```console
wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
@ -20,7 +20,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
```bash ```console
# We recommend using the tokenizer from the base model to avoid a slow and potentially buggy tokenizer conversion.
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -32,7 +32,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
GGUF assumes that Hugging Face can convert the metadata to a config file. If Hugging Face doesn't support your model, you can manually create a config and pass it as `hf-config-path`:
```bash ```console
# If your model is not supported by huggingface, you can manually provide a huggingface compatible config path
vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
--tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
@ -41,44 +41,42 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint: You can also use the GGUF model directly through the LLM entrypoint:
??? Code ```python
from vllm import LLM, SamplingParams
```python # In this script, we demonstrate how to pass input to the chat method:
from vllm import LLM, SamplingParams conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
# In this script, we demonstrate how to pass input to the chat method: # Create a sampling params object.
conversation = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
# Create a sampling params object. # Create an LLM.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)
# Create an LLM. # Print the outputs.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", for output in outputs:
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") prompt = output.prompt
# Generate texts from the prompts. The output is a list of RequestOutput objects generated_text = output.outputs[0].text
# that contain the prompt, generated text, and other information. print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
outputs = llm.chat(conversation, sampling_params) ```
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
@ -21,7 +21,7 @@ for more details on this and other advanced features.
You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq). You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).
```bash ```console
pip install -U gptqmodel --no-build-isolation -v pip install -U gptqmodel --no-build-isolation -v
``` ```
@ -31,36 +31,34 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
??? Code ```python
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
```python model_id = "meta-llama/Llama-3.2-1B-Instruct"
from datasets import load_dataset quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
from gptqmodel import GPTQModel, QuantizeConfig
model_id = "meta-llama/Llama-3.2-1B-Instruct" calibration_dataset = load_dataset(
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" "allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
calibration_dataset = load_dataset( quant_config = QuantizeConfig(bits=4, group_size=128)
"allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128) model = GPTQModel.load(model_id, quant_config)
model = GPTQModel.load(model_id, quant_config) # increase `batch_size` to match gpu/vram specs to speed up quantization
model.quantize(calibration_dataset, batch_size=2)
# increase `batch_size` to match gpu/vram specs to speed up quantization model.save(quant_path)
model.quantize(calibration_dataset, batch_size=2) ```
model.save(quant_path)
```
## Running a quantized model with vLLM ## Running a quantized model with vLLM
To run a GPTQModel-quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: To run a GPTQModel-quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
```bash ```console
python examples/offline_inference/llm_engine_example.py \ python examples/offline_inference/llm_engine_example.py \
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
``` ```
@ -69,34 +67,32 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint: GPTQModel quantized models are also supported directly through the LLM entrypoint:
??? Code ```python
from vllm import LLM, SamplingParams
```python # Sample prompts.
from vllm import LLM, SamplingParams prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Sample prompts. # Create a sampling params object.
prompts = [ sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object. # Create an LLM.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9) llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# Create an LLM. # Generate texts from the prompts. The output is a list of RequestOutput objects
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Print the outputs.
# that contain the prompt, generated text, and other information. print("-"*50)
outputs = llm.generate(prompts, sampling_params) for output in outputs:
prompt = output.prompt
# Print the outputs. generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50) print("-"*50)
for output in outputs: ```
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50)
```
@ -14,13 +14,13 @@ Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs re
To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: To use INT4 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```bash ```console
pip install llmcompressor pip install llmcompressor
``` ```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```console
pip install vllm lm-eval==0.4.4 pip install vllm lm-eval==0.4.4
``` ```
@ -53,55 +53,51 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
??? Code ```python
from datasets import load_dataset
```python NUM_CALIBRATION_SAMPLES = 512
from datasets import load_dataset MAX_SEQUENCE_LENGTH = 2048
NUM_CALIBRATION_SAMPLES = 512 # Load and preprocess the dataset
MAX_SEQUENCE_LENGTH = 2048 ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
# Load and preprocess the dataset def preprocess(example):
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) ds = ds.map(preprocess)
def preprocess(example): def tokenize(sample):
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(preprocess) ds = ds.map(tokenize, remove_columns=ds.column_names)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
??? Code ```python
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
```python # Configure the quantization algorithms
from llmcompressor.transformers import oneshot recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Configure the quantization algorithms # Apply quantization
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Apply quantization # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
oneshot( SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model=model, model.save_pretrained(SAVE_DIR, save_compressed=True)
dataset=ds, tokenizer.save_pretrained(SAVE_DIR)
recipe=recipe, ```
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W4A16 model with weights quantized to 4-bit integers. This process creates a W4A16 model with weights quantized to 4-bit integers.
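As a quick sanity check, you can load the saved checkpoint back into vLLM and generate a short completion. This is a minimal sketch that assumes the `SAVE_DIR` produced above (`Meta-Llama-3-8B-Instruct-W4A16-G128`) sits in the current directory:

```python
from vllm import LLM, SamplingParams

# Load the locally saved W4A16 checkpoint (directory created by model.save_pretrained above).
llm = LLM(model="./Meta-Llama-3-8B-Instruct-W4A16-G128")
sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```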
@ -116,8 +112,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
To evaluate accuracy, you can use `lm_eval`: To evaluate accuracy, you can use `lm_eval`:
```bash ```console
lm_eval --model vllm \ $ lm_eval --model vllm \
--model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W4A16-G128",add_bos_token=true \
--tasks gsm8k \ --tasks gsm8k \
--num_fewshot 5 \ --num_fewshot 5 \
@ -141,36 +137,34 @@ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case: The following is an example of an expanded quantization recipe you can tune to your own use case:
??? Code ```python
from compressed_tensors.quantization import (
```python QuantizationArgs,
from compressed_tensors.quantization import ( QuantizationScheme,
QuantizationArgs, QuantizationStrategy,
QuantizationScheme, QuantizationType,
QuantizationStrategy, )
QuantizationType, recipe = GPTQModifier(
) targets="Linear",
recipe = GPTQModifier( config_groups={
targets="Linear", "config_group": QuantizationScheme(
config_groups={ targets=["Linear"],
"config_group": QuantizationScheme( weights=QuantizationArgs(
targets=["Linear"], num_bits=4,
weights=QuantizationArgs( type=QuantizationType.INT,
num_bits=4, strategy=QuantizationStrategy.GROUP,
type=QuantizationType.INT, group_size=128,
strategy=QuantizationStrategy.GROUP, symmetric=True,
group_size=128, dynamic=False,
symmetric=True, actorder="weight",
dynamic=False,
actorder="weight",
),
), ),
}, ),
ignore=["lm_head"], },
update_size=NUM_CALIBRATION_SAMPLES, ignore=["lm_head"],
dampening_frac=0.01 update_size=NUM_CALIBRATION_SAMPLES,
) dampening_frac=0.01
``` )
```
## Troubleshooting and Support ## Troubleshooting and Support
@ -15,13 +15,13 @@ Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs re
To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library:
```bash ```console
pip install llmcompressor pip install llmcompressor
``` ```
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```console
pip install vllm lm-eval==0.4.4 pip install vllm lm-eval==0.4.4
``` ```
@ -54,60 +54,54 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
??? Code ```python
from datasets import load_dataset
```python NUM_CALIBRATION_SAMPLES = 512
from datasets import load_dataset MAX_SEQUENCE_LENGTH = 2048
NUM_CALIBRATION_SAMPLES = 512 # Load and preprocess the dataset
MAX_SEQUENCE_LENGTH = 2048 ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
# Load and preprocess the dataset def preprocess(example):
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) ds = ds.map(preprocess)
def preprocess(example): def tokenize(sample):
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(preprocess) ds = ds.map(tokenize, remove_columns=ds.column_names)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
??? Code ```python
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
```python # Configure the quantization algorithms
from llmcompressor.transformers import oneshot recipe = [
from llmcompressor.modifiers.quantization import GPTQModifier SmoothQuantModifier(smoothing_strength=0.8),
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]
# Configure the quantization algorithms # Apply quantization
recipe = [ oneshot(
SmoothQuantModifier(smoothing_strength=0.8), model=model,
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), dataset=ds,
] recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Apply quantization # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
oneshot( SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model=model, model.save_pretrained(SAVE_DIR, save_compressed=True)
dataset=ds, tokenizer.save_pretrained(SAVE_DIR)
recipe=recipe, ```
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers. This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
@ -122,8 +116,8 @@ model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
To evaluate accuracy, you can use `lm_eval`: To evaluate accuracy, you can use `lm_eval`:
```bash ```console
lm_eval --model vllm \ $ lm_eval --model vllm \
--model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \
--tasks gsm8k \ --tasks gsm8k \
--num_fewshot 5 \ --num_fewshot 5 \
@ -4,7 +4,7 @@ The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-O
We recommend installing the library with: We recommend installing the library with:
```bash ```console
pip install nvidia-modelopt pip install nvidia-modelopt
``` ```
@ -14,26 +14,24 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API: Below is an example showing how to quantize a model using modelopt's PTQ API:
??? Code ```python
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
```python # Load the model from HuggingFace
import modelopt.torch.quantization as mtq model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
from transformers import AutoModelForCausalLM
# Load the model from HuggingFace # Select the quantization config, for example, FP8
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>") config = mtq.FP8_DEFAULT_CFG
# Select the quantization config, for example, FP8 # Define a forward loop function for calibration
config = mtq.FP8_DEFAULT_CFG def forward_loop(model):
for data in calib_set:
model(data)
# Define a forward loop function for calibration # PTQ with in-place replacement of quantized modules
def forward_loop(model): model = mtq.quantize(model, config, forward_loop)
for data in calib_set: ```
model(data)
# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, config, forward_loop)
```
After the model is quantized, you can export it to a quantized checkpoint using the export API: After the model is quantized, you can export it to a quantized checkpoint using the export API:
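A minimal export sketch could look like the following, reusing `model` from the PTQ example above. Note that `export_hf_checkpoint` and the output directory name are assumptions based on recent ModelOpt releases, so check the version you have installed for the exact API:

```python
import torch
from modelopt.torch.export import export_hf_checkpoint

# Export the quantized model to a HuggingFace-style checkpoint directory
# that vLLM can load directly.
with torch.inference_mode():
    export_hf_checkpoint(model, export_dir="./llama-3.1-8b-instruct-fp8")
```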
@ -50,33 +48,31 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
??? Code ```python
from vllm import LLM, SamplingParams
```python def main():
from vllm import LLM, SamplingParams
def main(): model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9) prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
prompts = [ outputs = llm.generate(prompts, sampling_params)
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params) for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
for output in outputs: if __name__ == "__main__":
prompt = output.prompt main()
generated_text = output.outputs[0].text ```
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
main()
```
@ -35,22 +35,20 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
Here is an example of how to enable FP8 quantization: Here is an example of how to enable FP8 quantization:
??? Code ```python
# To calculate kv cache scales on the fly enable the calculate_kv_scales
# parameter
```python from vllm import LLM, SamplingParams
# To calculate kv cache scales on the fly enable the calculate_kv_scales
# parameter
from vllm import LLM, SamplingParams sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
sampling_params = SamplingParams(temperature=0.7, top_p=0.8) kv_cache_dtype="fp8",
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", calculate_kv_scales=True)
kv_cache_dtype="fp8", prompt = "London is the capital of"
calculate_kv_scales=True) out = llm.generate(prompt, sampling_params)[0].outputs[0].text
prompt = "London is the capital of" print(out)
out = llm.generate(prompt, sampling_params)[0].outputs[0].text ```
print(out)
```
The `kv_cache_dtype` argument specifies the data type for KV cache storage: The `kv_cache_dtype` argument specifies the data type for KV cache storage:
- `"auto"`: Uses the model's default "unquantized" data type - `"auto"`: Uses the model's default "unquantized" data type
@ -65,7 +63,7 @@ For optimal model quality when using FP8 KV Cache, we recommend using calibrated
First, install the required dependencies: First, install the required dependencies:
```bash ```console
pip install llmcompressor pip install llmcompressor
``` ```
@ -73,69 +71,67 @@ pip install llmcompressor
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
??? Code ```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.transformers import oneshot
```python # Select model and load it
from datasets import load_dataset MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
from llmcompressor.transformers import oneshot tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select model and load it # Select calibration dataset
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" DATASET_ID = "HuggingFaceH4/ultrachat_200k"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") DATASET_SPLIT = "train_sft"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset # Configure calibration parameters
DATASET_ID = "HuggingFaceH4/ultrachat_200k" NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point
DATASET_SPLIT = "train_sft" MAX_SEQUENCE_LENGTH = 2048
# Configure calibration parameters # Load and preprocess dataset
NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
MAX_SEQUENCE_LENGTH = 2048 ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
# Load and preprocess dataset def process_and_tokenize(example):
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) return tokenizer(
text,
def process_and_tokenize(example): padding=False,
text = tokenizer.apply_chat_template(example["messages"], tokenize=False) max_length=MAX_SEQUENCE_LENGTH,
return tokenizer( truncation=True,
text, add_special_tokens=False,
padding=False,
max_length=MAX_SEQUENCE_LENGTH,
truncation=True,
add_special_tokens=False,
)
ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
# Configure quantization settings
recipe = """
quant_stage:
quant_modifiers:
QuantizationModifier:
kv_cache_scheme:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
"""
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
) )
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True) # Configure quantization settings
tokenizer.save_pretrained(SAVE_DIR) recipe = """
``` quant_stage:
quant_modifiers:
QuantizationModifier:
kv_cache_scheme:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
"""
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
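To use the calibrated checkpoint, point vLLM at the saved folder and request the FP8 KV cache. This is a minimal sketch assuming the directory name produced by the script above:

```python
from vllm import LLM, SamplingParams

# Load the calibrated checkpoint and store the KV cache in FP8.
# The calibrated scales come from the checkpoint, so calculate_kv_scales
# is not needed here.
llm = LLM(model="./Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)

out = llm.generate("London is the capital of", sampling_params)[0].outputs[0].text
print(out)
```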
@ -13,7 +13,7 @@ AWQ, GPTQ, Rotation and SmoothQuant.
Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip: Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip:
```bash ```console
pip install amd-quark pip install amd-quark
``` ```
@ -22,13 +22,13 @@ for more installation details.
Additionally, install `vllm` and `lm-evaluation-harness` for evaluation: Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:
```bash ```console
pip install vllm lm-eval==0.4.4 pip install vllm lm-eval==0.4.4
``` ```
## Quantization Process ## Quantization Process
After installing Quark, we will walk through an example of how to use it. After installing Quark, we will walk through an example of how to use it.
The Quark quantization process consists of the following five steps: The Quark quantization process consists of the following five steps:
1. Load the model 1. Load the model
@ -42,22 +42,20 @@ The Quark quantization process can be listed for 5 steps as below:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch the model and tokenizer. to fetch the model and tokenizer.
??? Code ```python
from transformers import AutoTokenizer, AutoModelForCausalLM
```python MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
from transformers import AutoTokenizer, AutoModelForCausalLM MAX_SEQ_LEN = 512
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf" model = AutoModelForCausalLM.from_pretrained(
MAX_SEQ_LEN = 512 MODEL_ID, device_map="auto", torch_dtype="auto",
)
model.eval()
model = AutoModelForCausalLM.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
MODEL_ID, device_map="auto", torch_dtype="auto", tokenizer.pad_token = tokenizer.eos_token
) ```
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
tokenizer.pad_token = tokenizer.eos_token
```
### 2. Prepare the Calibration Dataloader ### 2. Prepare the Calibration Dataloader
@ -65,24 +63,22 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
to load calibration data. For more details about how to use calibration datasets efficiently, please refer to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html). to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
??? Code ```python
from datasets import load_dataset
from torch.utils.data import DataLoader
```python BATCH_SIZE = 1
from datasets import load_dataset NUM_CALIBRATION_DATA = 512
from torch.utils.data import DataLoader
BATCH_SIZE = 1 # Load the dataset and get calibration data.
NUM_CALIBRATION_DATA = 512 dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
# Load the dataset and get calibration data. tokenized_outputs = tokenizer(text_data, return_tensors="pt",
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") padding=True, truncation=True, max_length=MAX_SEQ_LEN)
text_data = dataset["text"][:NUM_CALIBRATION_DATA] calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True)
tokenized_outputs = tokenizer(text_data, return_tensors="pt", ```
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True)
```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
@ -98,44 +94,42 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
The AutoSmoothQuant config file for Llama is The AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
??? Code ```python
from quark.torch.quantization import (Config, QuantizationConfig,
FP8E4M3PerTensorSpec,
load_quant_algo_config_from_file)
```python # Define fp8/per-tensor/static spec.
from quark.torch.quantization import (Config, QuantizationConfig, FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
FP8E4M3PerTensorSpec, is_dynamic=False).to_quantization_spec()
load_quant_algo_config_from_file)
# Define fp8/per-tensor/static spec. # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
is_dynamic=False).to_quantization_spec() weight=FP8_PER_TENSOR_SPEC)
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
weight=FP8_PER_TENSOR_SPEC) kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
layer_quant_config = kv_cache_quant_config.copy()
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. # Define algorithm config by config file.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
kv_cache_quant_config = {name : algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
layer_quant_config = kv_cache_quant_config.copy()
# Define algorithm config by config file. EXCLUDE_LAYERS = ["lm_head"]
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = quant_config = Config(
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' global_quant_config=global_quant_config,
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
EXCLUDE_LAYERS = ["lm_head"] exclude=EXCLUDE_LAYERS,
quant_config = Config( algo_config=algo_config)
global_quant_config=global_quant_config, ```
layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
```
### 4. Quantize the Model and Export ### 4. Quantize the Model and Export
@ -145,72 +139,68 @@ HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html) [HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details. for more exporting format details.
??? Code ```python
import torch
from quark.torch import ModelQuantizer, ModelExporter
from quark.torch.export import ExporterConfig, JsonExporterConfig
```python # Apply quantization.
import torch quantizer = ModelQuantizer(quant_config)
from quark.torch import ModelQuantizer, ModelExporter quant_model = quantizer.quantize_model(model, calib_dataloader)
from quark.torch.export import ExporterConfig, JsonExporterConfig
# Apply quantization. # Freeze quantized model to export.
quantizer = ModelQuantizer(quant_config) freezed_model = quantizer.freeze(model)
quant_model = quantizer.quantize_model(model, calib_dataloader)
# Freeze quantized model to export. # Define export config.
freezed_model = quantizer.freeze(model) LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
# Define export config. # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"] EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
export_config = ExporterConfig(json_export_config=JsonExporterConfig()) exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant quant_config=quant_config, tokenizer=tokenizer)
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" ```
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
```
### 5. Evaluation in vLLM ### 5. Evaluation in vLLM
Now, you can load and run the Quark quantized model directly through the LLM entrypoint: Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
??? Code ```python
from vllm import LLM, SamplingParams
```python # Sample prompts.
from vllm import LLM, SamplingParams prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Sample prompts. # Create an LLM.
prompts = [ llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
"Hello, my name is", kv_cache_dtype='fp8',quantization='quark')
"The president of the United States is", # Generate texts from the prompts. The output is a list of RequestOutput objects
"The capital of France is", # that contain the prompt, generated text, and other information.
"The future of AI is", outputs = llm.generate(prompts, sampling_params)
] # Print the outputs.
# Create a sampling params object. print("\nGenerated Outputs:\n" + "-" * 60)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) for output in outputs:
prompt = output.prompt
# Create an LLM. generated_text = output.outputs[0].text
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", print(f"Prompt: {prompt!r}")
kv_cache_dtype='fp8',quantization='quark') print(f"Output: {generated_text!r}")
# Generate texts from the prompts. The output is a list of RequestOutput objects print("-" * 60)
# that contain the prompt, generated text, and other information. ```
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}")
print("-" * 60)
```
Or, you can use `lm_eval` to evaluate accuracy: Or, you can use `lm_eval` to evaluate accuracy:
```bash ```console
lm_eval --model vllm \ $ lm_eval --model vllm \
--model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \ --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \
--tasks gsm8k --tasks gsm8k
``` ```
@ -222,7 +212,7 @@ to quantize large language models more conveniently. It supports quantizing mode
of different quantization schemes and optimization algorithms. It can export the quantized model of different quantization schemes and optimization algorithms. It can export the quantized model
and run evaluation tasks on the fly. With the script, the example above can be: and run evaluation tasks on the fly. With the script, the example above can be:
```bash ```console
python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
--output_dir /path/to/output \ --output_dir /path/to/output \
--quant_scheme w_fp8_a_fp8 \ --quant_scheme w_fp8_a_fp8 \
@ -4,7 +4,7 @@ TorchAO is an architecture optimization library for PyTorch, it provides high pe
We recommend installing the latest torchao nightly with We recommend installing the latest torchao nightly with
```bash ```console
# Install the latest TorchAO nightly build # Install the latest TorchAO nightly build
# Choose the CUDA version that matches your system (cu126, cu128, etc.) # Choose the CUDA version that matches your system (cu126, cu128, etc.)
pip install \ pip install \
@ -15,28 +15,26 @@ pip install \
## Quantizing HuggingFace Models ## Quantizing HuggingFace Models
You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
??? Code ```Python
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
```Python model_name = "meta-llama/Meta-Llama-3-8B"
import torch quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer quantized_model = AutoModelForCausalLM.from_pretrained(
from torchao.quantization import Int8WeightOnlyConfig model_name,
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
model_name = "meta-llama/Meta-Llama-3-8B" hub_repo = # YOUR HUB REPO ID
quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) tokenizer.push_to_hub(hub_repo)
quantized_model = AutoModelForCausalLM.from_pretrained( quantized_model.push_to_hub(hub_repo, safe_serialization=False)
model_name, ```
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
hub_repo = # YOUR HUB REPO ID
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI. Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
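Once the quantized checkpoint has been pushed, you can load it in vLLM like any other Hugging Face model. This is a minimal sketch: `your-username/llama3-8b-int8wo` is a placeholder for the hub repo used above, and `dtype="bfloat16"` matches the dtype torchao examples typically use:

```python
from vllm import LLM, SamplingParams

# Replace with the hub repo id you pushed the torchao-quantized model to.
llm = LLM(model="your-username/llama3-8b-int8wo", dtype="bfloat16")
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

outputs = llm.generate(["What are we having for dinner?"], sampling_params)
print(outputs[0].outputs[0].text)
```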
@ -33,36 +33,34 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
Next, make a request to the model that should return the reasoning content in the response. Next, make a request to the model that should return the reasoning content in the response.
??? Code ```python
from openai import OpenAI
```python # Modify OpenAI's API key and API base to use vLLM's API server.
from openai import OpenAI openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
# Modify OpenAI's API key and API base to use vLLM's API server. client = OpenAI(
openai_api_key = "EMPTY" api_key=openai_api_key,
openai_api_base = "http://localhost:8000/v1" base_url=openai_api_base,
)
client = OpenAI( models = client.models.list()
api_key=openai_api_key, model = models.data[0].id
base_url=openai_api_base,
)
models = client.models.list() # Round 1
model = models.data[0].id messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages)
# Round 1 reasoning_content = response.choices[0].message.reasoning_content
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] content = response.choices[0].message.content
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content print("reasoning_content:", reasoning_content)
content = response.choices[0].message.content print("content:", content)
```
print("reasoning_content:", reasoning_content)
print("content:", content)
```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
@ -70,81 +68,77 @@ The `reasoning_content` field contains the reasoning steps that led to the final
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
??? Json ```json
{
```json "id": "chatcmpl-123",
{ "object": "chat.completion.chunk",
"id": "chatcmpl-123", "created": 1694268190,
"object": "chat.completion.chunk", "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"created": 1694268190, "system_fingerprint": "fp_44709d6fcb",
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "choices": [
"system_fingerprint": "fp_44709d6fcb", {
"choices": [ "index": 0,
{ "delta": {
"index": 0, "role": "assistant",
"delta": { "reasoning_content": "is",
"role": "assistant", },
"reasoning_content": "is", "logprobs": null,
}, "finish_reason": null
"logprobs": null, }
"finish_reason": null ]
} }
] ```
}
```
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does allow extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example: The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does allow extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
??? Code ```python
from openai import OpenAI
```python # Modify OpenAI's API key and API base to use vLLM's API server.
from openai import OpenAI openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
# Modify OpenAI's API key and API base to use vLLM's API server. client = OpenAI(
openai_api_key = "EMPTY" api_key=openai_api_key,
openai_api_base = "http://localhost:8000/v1" base_url=openai_api_base,
)
client = OpenAI( models = client.models.list()
api_key=openai_api_key, model = models.data[0].id
base_url=openai_api_base,
)
models = client.models.list() messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
model = models.data[0].id # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] print("client: Start streaming chat completions...")
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` printed_reasoning_content = False
# For Qwen3 series, if you want to disable thinking in reasoning mode, add: printed_content = False
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model,
messages=messages,
stream=True)
print("client: Start streaming chat completions...") for chunk in stream:
printed_reasoning_content = False reasoning_content = None
printed_content = False content = None
# Check the content is reasoning_content or content
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
content = chunk.choices[0].delta.content
for chunk in stream: if reasoning_content is not None:
reasoning_content = None if not printed_reasoning_content:
content = None printed_reasoning_content = True
# Check the content is reasoning_content or content print("reasoning_content:", end="", flush=True)
if hasattr(chunk.choices[0].delta, "reasoning_content"): print(reasoning_content, end="", flush=True)
reasoning_content = chunk.choices[0].delta.reasoning_content elif content is not None:
elif hasattr(chunk.choices[0].delta, "content"): if not printed_content:
content = chunk.choices[0].delta.content printed_content = True
print("\ncontent:", end="", flush=True)
if reasoning_content is not None: # Extract and print the content
if not printed_reasoning_content: print(content, end="", flush=True)
printed_reasoning_content = True ```
print("reasoning_content:", end="", flush=True)
print(reasoning_content, end="", flush=True)
elif content is not None:
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
# Extract and print the content
print(content, end="", flush=True)
```
Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
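For non-streaming responses, an equivalent defensive pattern is to read the attribute with a default. A minimal sketch, reusing the `client`, `model`, and `messages` from the example above:

```python
# Fall back to None if the server or reasoning parser did not emit reasoning content.
message = client.chat.completions.create(model=model, messages=messages).choices[0].message
reasoning_content = getattr(message, "reasoning_content", None)
if reasoning_content is not None:
    print("reasoning_content:", reasoning_content)
print("content:", message.content)
```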
@ -152,43 +146,41 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
??? Code ```python
from openai import OpenAI
```python client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") tools = [{
"type": "function",
tools = [{ "function": {
"type": "function", "name": "get_weather",
"function": { "description": "Get the current weather in a given location",
"name": "get_weather", "parameters": {
"description": "Get the current weather in a given location", "type": "object",
"parameters": { "properties": {
"type": "object", "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"properties": { "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, },
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} "required": ["location", "unit"]
},
"required": ["location", "unit"]
}
} }
}] }
}]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto"
) )
print(response) print(response)
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {response.choices[0].message.reasoning_content}") print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
print(f"Function called: {tool_call.name}") print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}") print(f"Arguments: {tool_call.arguments}")
``` ```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>. For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
@ -200,89 +192,85 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
??? Code ```python
# import the required packages
```python from vllm.reasoning import ReasoningParser, ReasoningParserManager
# import the required packages from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
from vllm.reasoning import ReasoningParser, ReasoningParserManager # define a reasoning parser and register it to vllm
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, # the name list in register_module can be used
DeltaMessage) # in --reasoning-parser.
@ReasoningParserManager.register_module(["example"])
class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer)
# define a reasoning parser and register it to vllm def extract_reasoning_content_streaming(
# the name list in register_module can be used self,
# in --reasoning-parser. previous_text: str,
@ReasoningParserManager.register_module(["example"]) current_text: str,
class ExampleParser(ReasoningParser): delta_text: str,
def __init__(self, tokenizer: AnyTokenizer): previous_token_ids: Sequence[int],
super().__init__(tokenizer) current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]:
"""
Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and
streaming. Has to be an instance method because it requires state -
the current tokens/diffs, but also the information about what has
previously been parsed and extracted (see constructor)
"""
def extract_reasoning_content_streaming( def extract_reasoning_content(
self, self, model_output: str, request: ChatCompletionRequest
previous_text: str, ) -> tuple[Optional[str], Optional[str]]:
current_text: str, """
delta_text: str, Extract reasoning content from a complete model-generated string.
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]:
"""
Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and
streaming. Has to be an instance method because it requires state -
the current tokens/diffs, but also the information about what has
previously been parsed and extracted (see constructor)
"""
def extract_reasoning_content( Used for non-streaming responses where we have the entire model response
self, model_output: str, request: ChatCompletionRequest available before sending to the client.
) -> tuple[Optional[str], Optional[str]]:
"""
Extract reasoning content from a complete model-generated string.
Used for non-streaming responses where we have the entire model response Parameters:
available before sending to the client. model_output: str
The model-generated string to extract reasoning content from.
Parameters: request: ChatCompletionRequest
model_output: str The request object that was used to generate the model_output.
The model-generated string to extract reasoning content from.
request: ChatCompletionRequest Returns:
The request object that was used to generate the model_output. tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
Returns: """
tuple[Optional[str], Optional[str]] ```
A tuple containing the reasoning content and the content.
"""
```
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
??? Code ```python
@dataclass
class DeepSeekReasoner(Reasoner):
"""
Reasoner for DeepSeek R series models.
"""
start_token_id: int
end_token_id: int
```python start_token: str = "<think>"
@dataclass end_token: str = "</think>"
class DeepSeekReasoner(Reasoner):
"""
Reasoner for DeepSeek R series models.
"""
start_token_id: int
end_token_id: int
start_token: str = "<think>" @classmethod
end_token: str = "</think>" def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
return cls(start_token_id=tokenizer.encode(
"<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>",
add_special_tokens=False)[0])
@classmethod def is_reasoning_end(self, input_ids: list[int]) -> bool:
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: return self.end_token_id in input_ids
return cls(start_token_id=tokenizer.encode( ...
"<think>", add_special_tokens=False)[0], ```
end_token_id=tokenizer.encode("</think>",
add_special_tokens=False)[0])
def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids
...
```
The structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case. The structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.
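Conceptually, the gating looks something like the sketch below. This is illustrative only and is not the actual xgrammar integration code; `reasoner` is assumed to be an instance of a `Reasoner` like the one above:

```python
def should_apply_grammar(reasoner, input_ids: list[int]) -> bool:
    # Constrain decoding with the grammar only after the reasoning
    # section has closed, i.e. the end token has been generated.
    return reasoner.is_reasoning_end(input_ids)
```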
@ -18,31 +18,29 @@ Speculative decoding is a technique which improves inter-token latency in memory
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

```python
from vllm import LLM, SamplingParams

prompts = [
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(
    model="facebook/opt-6.7b",
    tensor_parallel_size=1,
    speculative_config={
        "model": "facebook/opt-125m",
        "num_speculative_tokens": 5,
    },
)
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
To perform the same with an online mode launch the server:
@ -62,73 +60,69 @@ python -m vllm.entrypoints.openai.api_server \
Then use a client:

```python
from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id

# Completion API
stream = False
completion = client.completions.create(
    model=model,
    prompt="The future of AI is",
    echo=False,
    n=1,
    stream=stream,
)

print("Completion results:")
if stream:
    for c in completion:
        print(c)
else:
    print(completion)
```
## Speculating by matching n-grams in the prompt

The following code configures vLLM to use speculative decoding where proposals are generated by
matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259)

```python
from vllm import LLM, SamplingParams

prompts = [
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(
    model="facebook/opt-6.7b",
    tensor_parallel_size=1,
    speculative_config={
        "method": "ngram",
        "num_speculative_tokens": 5,
        "prompt_lookup_max": 4,
    },
)
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

## Speculating using MLP speculators
@ -137,31 +131,29 @@ draft models that conditioning draft predictions on both context vectors and sam
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
[this technical report](https://arxiv.org/abs/2404.19124).

```python
from vllm import LLM, SamplingParams

prompts = [
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,
    speculative_config={
        "model": "ibm-ai-platform/llama3-70b-accelerator",
        "draft_tensor_parallel_size": 1,
    },
)
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the
@ -185,33 +177,31 @@ A variety of speculative models of this type are available on HF hub:
The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).

```python
from vllm import LLM, SamplingParams

prompts = [
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    tensor_parallel_size=4,
    speculative_config={
        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
        "draft_tensor_parallel_size": 1,
    },
)

outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

A few important things to consider when using the EAGLE based draft models:

View File

@ -33,43 +33,39 @@ text.
Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:

```python
from openai import OpenAI
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="-",
)
model = client.models.list().data[0].id

completion = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
    ],
    extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```

The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:

```python
completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
        }
    ],
    extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
)
print(completion.choices[0].message.content)
```

One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways:
@ -79,43 +75,41 @@ For this we can use the `guided_json` parameter in two different ways:
The next example shows how to use the `guided_json` parameter with a Pydantic model:

```python
from pydantic import BaseModel
from enum import Enum

class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"

class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType

json_schema = CarDescription.model_json_schema()

completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
        }
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "car-description",
            "schema": CarDescription.model_json_schema()
        },
    },
)
print(completion.choices[0].message.content)
```
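The other way is to pass the extracted JSON Schema directly through vLLM's `guided_json` extension parameter. A minimal sketch reusing the `json_schema` variable defined above:

```python
completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
        }
    ],
    extra_body={"guided_json": json_schema},
)
print(completion.choices[0].message.content)
```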
!!! tip
    While not strictly necessary, normally it's better to indicate in the prompt the
@ -127,35 +121,33 @@ difficult to use, but it´s really powerful. It allows us to define complete
languages like SQL queries. It works by using a context-free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:

```python
simplified_sql_grammar = """
root ::= select_statement

select_statement ::= "SELECT " column " from " table " where " condition

column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number

number ::= "1 " | "2 "
"""

completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
        }
    ],
    extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
```

See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@ -169,36 +161,34 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r
Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:

```python
from pydantic import BaseModel

class People(BaseModel):
    name: str
    age: int

completion = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "user",
            "content": "Generate a JSON with the name and age of one random person.",
        }
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "people",
            "schema": People.model_json_schema()
        }
    },
)
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```

See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@ -212,33 +202,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.
Here is a simple example demonstrating how to get structured output using Pydantic models:

```python
from pydantic import BaseModel
from openai import OpenAI

class Info(BaseModel):
    name: str
    age: int

client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse(
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
    ],
    response_format=Info,
)

message = completion.choices[0].message
print(message)
assert message.parsed
print("Name:", message.parsed.name)
print("Age:", message.parsed.age)
```

Output:

```console
ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
@ -248,37 +238,35 @@ Age: 28
Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:

```python
from typing import List
from pydantic import BaseModel
from openai import OpenAI

class Step(BaseModel):
    explanation: str
    output: str

class MathResponse(BaseModel):
    steps: list[Step]
    final_answer: str

completion = client.beta.chat.completions.parse(
    model=model,
    messages=[
        {"role": "system", "content": "You are a helpful expert math tutor."},
        {"role": "user", "content": "Solve 8x + 31 = 2."},
    ],
    response_format=MathResponse,
)

message = completion.choices[0].message
print(message)
assert message.parsed
for i, step in enumerate(message.parsed.steps):
    print(f"Step #{i}:", step)
print("Answer:", message.parsed.final_answer)
```

Output:
@ -308,21 +296,19 @@ These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is
shown below:

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")

guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
    prompts="Classify this sentiment: vLLM is wonderful!",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```
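Assuming the same offline API, a JSON-schema variant can be sketched along the same lines; the Pydantic model below is made up purely for illustration:

```python
from pydantic import BaseModel

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

class Person(BaseModel):
    name: str
    age: int

llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")

guided_decoding_params = GuidedDecodingParams(json=Person.model_json_schema())
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
    prompts="Generate a JSON with the name and age of one random person.",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```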
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)

View File

@ -15,46 +15,44 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
Next, make a request to the model that should result in it using the available tools:

```python
from openai import OpenAI
import json

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

def get_weather(location: str, unit: str):
    return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather}

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["location", "unit"]
        }
    }
}]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
    tools=tools,
    tool_choice="auto"
)

tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
```
Example output:
@ -228,25 +226,6 @@ AI21's Jamba-1.5 models are supported.
Flags: `--tool-call-parser jamba`
### xLAM Models (`xlam`)
The xLAM tool parser is designed to support models that generate tool calls in various JSON formats. It detects function calls in several different output styles:
1. Direct JSON arrays: Output strings that are JSON arrays starting with `[` and ending with `]`
2. Thinking tags: Using `<think>...</think>` tags containing JSON arrays
3. Code blocks: JSON in code blocks (```json ...```)
4. Tool calls tags: Using `[TOOL_CALLS]` or `<tool_call>...</tool_call>` tags
Parallel function calls are supported, and the parser can effectively separate text content from tool calls.
Supported models:
* Salesforce Llama-xLAM models: `Salesforce/Llama-xLAM-2-8B-fc-r`, `Salesforce/Llama-xLAM-2-70B-fc-r`
* Qwen-xLAM models: `Salesforce/xLAM-1B-fc-r`, `Salesforce/xLAM-3B-fc-r`, `Salesforce/Qwen-xLAM-32B-fc-r`
Flags:
* For Llama-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_llama.jinja`
* For Qwen-based xLAM models: `--tool-call-parser xlam --chat-template examples/tool_chat_template_xlam_qwen.jinja`
### Qwen Models

For Qwen2.5, the chat template in tokenizer_config.json has already included support for the Hermes-style tool use. Therefore, you can use the `hermes` parser to enable tool calls for Qwen models. For more detailed information, please refer to the official [Qwen documentation](https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm)
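For instance, launching a Qwen2.5 model with tool calling enabled might look like the following; the model name is only an example:

```console
vllm serve Qwen/Qwen2.5-7B-Instruct \
    --enable-auto-tool-choice \
    --tool-call-parser hermes
```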
@ -303,55 +282,53 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
Here is a summary of a plugin file:

```python
# import the required packages

# define a tool parser and register it to vllm
# the name list in register_module can be used
# in --tool-call-parser. you can define as many
# tool parsers as you want here.
@ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser):
    def __init__(self, tokenizer: AnyTokenizer):
        super().__init__(tokenizer)

    # adjust request. e.g.: set skip special tokens
    # to False for tool call output.
    def adjust_request(
            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        return request

    # implement the tool call parse for stream call
    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> Union[DeltaMessage, None]:
        return delta

    # implement the tool parse for non-stream call
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=text)
```

Then you can use this plugin in the command line like this.

```console
--enable-auto-tool-choice \
--tool-parser-plugin <absolute path of the plugin file> \
--tool-call-parser example \

View File

@ -26,7 +26,7 @@ The easiest way to launch a Trainium or Inferentia instance with pre-installed N
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
- Once inside your instance, activate the pre-installed virtual environment for inference by running

```console
source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
```
@ -47,7 +47,7 @@ Currently, there are no pre-built Neuron wheels.
To build and install vLLM from source, run:

```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -U -r requirements/neuron.txt
@ -66,7 +66,7 @@ Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-
To install the AWS Neuron fork, run the following:

```console
git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
cd upstreaming-to-vllm
pip install -r requirements/neuron.txt
@ -100,7 +100,7 @@ to perform most of the heavy lifting which includes PyTorch model initialization
To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override
as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include

```console
override_neuron_config={
    "enable_bucketing":False,
}
@ -108,7 +108,7 @@ override_neuron_config={
or when launching vLLM from the CLI, pass

```console
--override-neuron-config "{\"enable_bucketing\":false}"
```
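Put together with `vllm serve`, a full invocation could look roughly like the sketch below; the model name is an assumption and any additional Neuron-specific arguments are omitted:

```console
vllm serve meta-llama/Llama-3.2-1B-Instruct \
    --override-neuron-config "{\"enable_bucketing\":false}"
```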

View File

@ -76,25 +76,21 @@ Currently, there are no pre-built CPU wheels.
### Build image from source

```console
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .

# Launching OpenAI server
$ docker run --rm \
             --privileged=true \
             --shm-size=4g \
             -p 8000:8000 \
             -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
             -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
             vllm-cpu-env \
             --model=meta-llama/Llama-3.2-1B-Instruct \
             --dtype=bfloat16 \
             other vLLM OpenAI server arguments
```

!!! tip
    For ARM or Apple silicon, use `docker/Dockerfile.arm`
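    For example, the equivalent build step on ARM might look like this (assuming the same image tag; the build targets in that Dockerfile may differ):

    ```console
    $ docker build -f docker/Dockerfile.arm --tag vllm-cpu-env .
    ```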
@ -123,7 +119,7 @@ vLLM CPU backend supports the following vLLM features:
- We highly recommend using TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.04, you can run:

```console
sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
find / -name *libtcmalloc* # find the dynamic link library path
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
@ -132,7 +128,7 @@ python examples/offline_inference/basic/basic.py # run vLLM
- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP:

```console
export VLLM_CPU_KVCACHE_SPACE=40
export VLLM_CPU_OMP_THREADS_BIND=0-29
vllm serve facebook/opt-125m
@ -140,7 +136,7 @@ vllm serve facebook/opt-125m
or using default auto thread binding:

```console
export VLLM_CPU_KVCACHE_SPACE=40
export VLLM_CPU_NUM_OF_RESERVED_CPU=2
vllm serve facebook/opt-125m
@ -148,34 +144,32 @@ vllm serve facebook/opt-125m
- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:

```console
$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000

# On this platform, it is recommended to bind OpenMP threads only on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py
```

- If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access.
@ -189,20 +183,14 @@ vllm serve facebook/opt-125m
- Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:

```console
VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
```

or using default auto thread binding:

```console
VLLM_CPU_KVCACHE_SPACE=40 vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
```

- For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to the same NUMA node.
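To check which logical CPUs belong to which NUMA node before setting the variable, standard tools can be used, for example:

```console
lscpu --extended   # the NODE column shows the NUMA node of each logical CPU
numactl --hardware # lists the NUMA nodes and the CPUs attached to each
```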

View File

@ -25,11 +25,11 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source.

```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements/cpu.txt
pip install -e .
```

!!! note

View File

@ -23,7 +23,7 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

--8<-- "docs/getting_started/installation/cpu/cpu/build.inc.md"

Testing has been conducted on AWS Graviton3 instances for compatibility.

View File

@ -1,6 +1,6 @@
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:

```console
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
@ -8,14 +8,14 @@ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /
Second, clone vLLM project:

```console
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```

Third, install Python packages for vLLM CPU backend building:

```console
pip install --upgrade pip
pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
@ -23,13 +23,13 @@ pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorc
Finally, build and install vLLM CPU backend:

```console
VLLM_TARGET_DEVICE=cpu python setup.py install
```

If you want to develop vllm, install it in editable mode instead.

```console
VLLM_TARGET_DEVICE=cpu python setup.py develop
```

View File

@ -26,7 +26,7 @@ Currently the CPU implementation for s390x architecture supports FP32 datatype o
Install the following packages from the package manager before building vLLM. For example on RHEL 9.4:

```console
dnf install -y \
    which procps findutils tar vim git gcc g++ make patch make cython zlib-devel \
    libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
@ -35,7 +35,7 @@ dnf install -y \
Install rust>=1.80, which is needed to install the `outlines-core` and `uvloop` Python packages.

```console
curl https://sh.rustup.rs -sSf | sh -s -- -y && \
    . "$HOME/.cargo/env"
```
@ -45,7 +45,7 @@ Execute the following commands to build and install vLLM from the source.
!!! tip
    Please build the following dependencies, `torchvision` and `pyarrow`, from source before building vLLM.

```console
sed -i '/^torch/d' requirements-build.txt    # remove torch from requirements-build.txt since we use nightly builds
pip install -v \
    --extra-index-url https://download.pytorch.org/whl/nightly/cpu \

View File

@ -24,7 +24,7 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

--8<-- "docs/getting_started/installation/cpu/cpu/build.inc.md"

!!! note
    - AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
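      Whether the host CPU advertises this extension can be checked before building, for example:

      ```console
      lscpu | grep -o avx512_bf16   # prints avx512_bf16 if the CPU supports it
      ```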

View File

@ -68,7 +68,7 @@ For more information about using TPUs with GKE, see:
Create a TPU v5e with 4 TPU chips:

```console
gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
    --node-id TPU_NAME \
    --project PROJECT_ID \
@ -156,13 +156,13 @@ See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for i
You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.

```console
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
```

Run the Docker image with the following command:

```console
# Make sure to add `--privileged --net host --shm-size=16G`.
docker run --privileged --net host --shm-size=16G -it vllm-tpu
```
@ -185,6 +185,6 @@ docker run --privileged --net host --shm-size=16G -it vllm-tpu
Install OpenBLAS with the following command:

```console
sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
```

View File

@ -22,7 +22,7 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
You can install vLLM using either `pip` or `uv pip`:

```console
# Install vLLM with CUDA 12.8.
# If you are using pip.
pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
@ -37,7 +37,7 @@ We recommend leveraging `uv` to [automatically select the appropriate PyTorch in
As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:

```console
# Install vLLM with CUDA 11.8.
export VLLM_VERSION=0.6.1.post1
export PYTHON_VERSION=312
@ -52,7 +52,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
##### Install the latest code using `pip`

```console
pip install -U vllm \
    --pre \
    --extra-index-url https://wheels.vllm.ai/nightly
@ -62,7 +62,7 @@ pip install -U vllm \
Another way to install the latest code is to use `uv`:

```console
uv pip install -U vllm \
    --torch-backend=auto \
    --extra-index-url https://wheels.vllm.ai/nightly
@ -72,7 +72,7 @@ uv pip install -U vllm \
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:

```console
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
```
@ -83,7 +83,7 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:

```console
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
uv pip install vllm \
    --torch-backend=auto \
@ -99,7 +99,7 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb
If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM:

```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
VLLM_USE_PRECOMPILED=1 pip install --editable .
@ -118,7 +118,7 @@ This command will do the following:
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.

```console
export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
pip install --editable .
@ -134,7 +134,7 @@ You can find more information about vLLM's wheels in [install-the-latest-code][i
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:

```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -e .
@ -160,7 +160,7 @@ There are scenarios where the PyTorch dependency cannot be easily installed via
To build vLLM using an existing PyTorch installation:

```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
python use_existing_torch.py
@ -173,7 +173,7 @@ pip install --no-build-isolation -e .
Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead.
To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory.

```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
@ -184,7 +184,7 @@ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e .
To avoid your system being overloaded, you can limit the number of compilation jobs
to be run simultaneously, via the environment variable `MAX_JOBS`. For example:

```console
export MAX_JOBS=6
pip install -e .
```
@ -194,7 +194,7 @@ A side effect is a much slower build process.
Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.

```console
# Use `--ipc=host` to make sure the shared memory is large enough.
docker run \
    --gpus all \
@ -205,14 +205,14 @@ docker run \
If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:

```console
export CUDA_HOME=/usr/local/cuda
export PATH="${CUDA_HOME}/bin:$PATH"
```

Here is a sanity check to verify that the CUDA Toolkit is correctly installed:

```console
nvcc --version # verify that nvcc is in your PATH
${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
```
@ -223,7 +223,7 @@ vLLM can fully run only on Linux but for development purposes, you can still bui
Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing:

```console
export VLLM_TARGET_DEVICE=empty
pip install -e .
```
@ -238,7 +238,7 @@ See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for i
Another way to access the latest code is to use the docker images: Another way to access the latest code is to use the docker images:
```bash ```console
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}
``` ```

View File

@ -31,17 +31,17 @@ Currently, there are no pre-built ROCm wheels.
Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example: Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example:
```bash ```console
# Install PyTorch # Install PyTorch
pip uninstall torch -y $ pip uninstall torch -y
pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
``` ```
1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md)
```bash ```console
python3 -m pip install ninja cmake wheel pybind11 python3 -m pip install ninja cmake wheel pybind11
pip uninstall -y triton pip uninstall -y triton
git clone https://github.com/OpenAI/triton.git git clone https://github.com/OpenAI/triton.git
@ -62,7 +62,7 @@ Currently, there are no pre-built ROCm wheels.
For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`.
```bash ```console
git clone https://github.com/ROCm/flash-attention.git git clone https://github.com/ROCm/flash-attention.git
cd flash-attention cd flash-attention
git checkout b7d29fb git checkout b7d29fb
@ -76,7 +76,7 @@ Currently, there are no pre-built ROCm wheels.
3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps: 3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps:
```bash ```console
python3 -m pip uninstall -y aiter python3 -m pip uninstall -y aiter
git clone --recursive https://github.com/ROCm/aiter.git git clone --recursive https://github.com/ROCm/aiter.git
cd aiter cd aiter
@ -90,26 +90,24 @@ Currently, there are no pre-built ROCm wheels.
4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
??? Commands ```bash
pip install --upgrade pip
```bash # Build & install AMD SMI
pip install --upgrade pip pip install /opt/rocm/share/amd_smi
# Build & install AMD SMI # Install dependencies
pip install /opt/rocm/share/amd_smi pip install --upgrade numba \
scipy \
huggingface-hub[cli,hf_transfer] \
setuptools_scm
pip install "numpy<2"
pip install -r requirements/rocm.txt
# Install dependencies # Build vLLM for MI210/MI250/MI300.
pip install --upgrade numba \ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
scipy \ python3 setup.py develop
huggingface-hub[cli,hf_transfer] \ ```
setuptools_scm
pip install "numpy<2"
pip install -r requirements/rocm.txt
# Build vLLM for MI210/MI250/MI300.
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
python3 setup.py develop
```
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
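Once the build finishes, a quick sanity check (illustrative, not part of the original steps) is to confirm that the package imports from the development install:

```bash
python3 -c "import vllm; print(vllm.__version__)"
```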
@ -148,7 +146,7 @@ If you choose to build this rocm_base image yourself, the steps are as follows.
It is important to kick off the docker build using BuildKit. Either set DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or enable BuildKit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: It is important to kick off the docker build using BuildKit. Either set DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or enable BuildKit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
```json ```console
{ {
"features": { "features": {
"buildkit": true "buildkit": true
@ -158,7 +156,7 @@ It is important that the user kicks off the docker build using buildkit. Either
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
```bash ```console
DOCKER_BUILDKIT=1 docker build \ DOCKER_BUILDKIT=1 docker build \
-f docker/Dockerfile.rocm_base \ -f docker/Dockerfile.rocm_base \
-t rocm/vllm-dev:base . -t rocm/vllm-dev:base .
@ -169,7 +167,7 @@ DOCKER_BUILDKIT=1 docker build \
First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image. First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
It is important to kick off the docker build using BuildKit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or enable BuildKit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: It is important to kick off the docker build using BuildKit. Either set `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or enable BuildKit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
```bash ```console
{ {
"features": { "features": {
"buildkit": true "buildkit": true
@ -187,13 +185,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
```bash ```console
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm . DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
``` ```
To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
```bash ```console
DOCKER_BUILDKIT=1 docker build \ DOCKER_BUILDKIT=1 docker build \
--build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \ --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \
-f docker/Dockerfile.rocm \ -f docker/Dockerfile.rocm \
@ -203,21 +201,19 @@ DOCKER_BUILDKIT=1 docker build \
To run the above docker image `vllm-rocm`, use the below command: To run the above docker image `vllm-rocm`, use the below command:
??? Command ```console
docker run -it \
```bash --network=host \
docker run -it \ --group-add=video \
--network=host \ --ipc=host \
--group-add=video \ --cap-add=SYS_PTRACE \
--ipc=host \ --security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \ --device /dev/kfd \
--security-opt seccomp=unconfined \ --device /dev/dri \
--device /dev/kfd \ -v <path/to/model>:/app/model \
--device /dev/dri \ vllm-rocm \
-v <path/to/model>:/app/model \ bash
vllm-rocm \ ```
bash
```
Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models. Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
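Inside the running `vllm-rocm` container, the mounted path can then be passed directly to vLLM. A minimal illustration (the `/app/model` path simply mirrors the mount above):

```bash
# Inside the container: serve the model weights mounted at /app/model
vllm serve /app/model
```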

View File

@ -25,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
- First, install required driver and Intel OneAPI 2025.0 or later. - First, install required driver and Intel OneAPI 2025.0 or later.
- Second, install Python packages for vLLM XPU backend building: - Second, install Python packages for vLLM XPU backend building:
```bash ```console
git clone https://github.com/vllm-project/vllm.git git clone https://github.com/vllm-project/vllm.git
cd vllm cd vllm
pip install --upgrade pip pip install --upgrade pip
@ -34,7 +34,7 @@ pip install -v -r requirements/xpu.txt
- Then, build and install vLLM XPU backend: - Then, build and install vLLM XPU backend:
```bash ```console
VLLM_TARGET_DEVICE=xpu python setup.py install VLLM_TARGET_DEVICE=xpu python setup.py install
``` ```
@ -53,9 +53,9 @@ Currently, there are no pre-built XPU images.
# --8<-- [end:pre-built-images] # --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source] # --8<-- [start:build-image-from-source]
```bash ```console
docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . $ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
docker run -it \ $ docker run -it \
--rm \ --rm \
--network=host \ --network=host \
--device /dev/dri \ --device /dev/dri \
@ -68,7 +68,7 @@ docker run -it \
The XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following: The XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following:
```bash ```console
python -m vllm.entrypoints.openai.api_server \ python -m vllm.entrypoints.openai.api_server \
--model=facebook/opt-13b \ --model=facebook/opt-13b \
--dtype=bfloat16 \ --dtype=bfloat16 \

View File

@ -24,7 +24,7 @@ please follow the methods outlined in the
To verify that the Intel Gaudi software was correctly installed, run: To verify that the Intel Gaudi software was correctly installed, run:
```bash ```console
hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
@ -42,7 +42,7 @@ for more details.
Use the following commands to run a Docker image: Use the following commands to run a Docker image:
```bash ```console
docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
docker run \ docker run \
-it \ -it \
@ -65,7 +65,7 @@ Currently, there are no pre-built Intel Gaudi wheels.
To build and install vLLM from source, run: To build and install vLLM from source, run:
```bash ```console
git clone https://github.com/vllm-project/vllm.git git clone https://github.com/vllm-project/vllm.git
cd vllm cd vllm
pip install -r requirements/hpu.txt pip install -r requirements/hpu.txt
@ -74,7 +74,7 @@ python setup.py develop
Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to the vLLM main repo. To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to the vLLM main repo. To install the latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following:
```bash ```console
git clone https://github.com/HabanaAI/vllm-fork.git git clone https://github.com/HabanaAI/vllm-fork.git
cd vllm-fork cd vllm-fork
git checkout habana_main git checkout habana_main
@ -90,7 +90,7 @@ Currently, there are no pre-built Intel Gaudi images.
### Build image from source ### Build image from source
```bash ```console
docker build -f docker/Dockerfile.hpu -t vllm-hpu-env . docker build -f docker/Dockerfile.hpu -t vllm-hpu-env .
docker run \ docker run \
-it \ -it \
@ -200,7 +200,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1
`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling -- `min` is multiplied by consecutive powers of two until `step` is reached. We call this the ramp-up phase, and it is used for handling lower batch sizes with minimal wastage, while allowing larger padding on larger batch sizes. `min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling -- `min` is multiplied by consecutive powers of two until `step` is reached. We call this the ramp-up phase, and it is used for handling lower batch sizes with minimal wastage, while allowing larger padding on larger batch sizes.
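As a rough illustration of this construction (a simplified sketch, not vLLM's actual bucketing code), the examples below can be reproduced with:

```python
def generate_buckets(bmin: int, bstep: int, bmax: int) -> list[int]:
    """Sketch of the (min, step, max) bucketing with the ramp-up phase described above."""
    buckets = set()
    # Ramp-up phase: multiply `min` by consecutive powers of two until `step` is reached.
    value = bmin
    while value < bstep:
        buckets.add(value)
        value *= 2
    # Stable phase: walk from `step` up to `max` in increments of `step`.
    value = bstep
    while value <= bmax:
        buckets.add(value)
        value += bstep
    return sorted(buckets)

print(generate_buckets(2, 32, 64))      # [2, 4, 8, 16, 32, 64]
print(generate_buckets(128, 128, 512))  # [128, 256, 384, 512]
```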
Example (with ramp-up): Example (with ramp-up)
```text ```text
min = 2, step = 32, max = 64 min = 2, step = 32, max = 64
@ -209,7 +209,7 @@ min = 2, step = 32, max = 64
=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
``` ```
Example (without ramp-up): Example (without ramp-up)
```text ```text
min = 128, step = 128, max = 512 min = 128, step = 128, max = 512
@ -232,21 +232,19 @@ As an example, if a request of 3 sequences, with max sequence length of 412 come
Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
??? Logs ```text
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB ...
INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
... INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB ...
INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
... INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB ```
INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
```
This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later, skipping further graph compilations. This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later, skipping further graph compilations.
@ -281,39 +279,37 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi
Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
??? Logs ```text
INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
```text INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) ...
INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
... INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) ...
INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
... INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB ...
INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
... INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory ```
INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
```
### Recommended vLLM Parameters ### Recommended vLLM Parameters

View File

@ -1,6 +1,6 @@
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
```bash ```console
uv venv --python 3.12 --seed uv venv --python 3.12 --seed
source .venv/bin/activate source .venv/bin/activate
``` ```

View File

@ -19,7 +19,7 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/
It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
```bash ```console
uv venv --python 3.12 --seed uv venv --python 3.12 --seed
source .venv/bin/activate source .venv/bin/activate
uv pip install vllm --torch-backend=auto uv pip install vllm --torch-backend=auto
@ -29,13 +29,13 @@ uv pip install vllm --torch-backend=auto
Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment: Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment:
```bash ```console
uv run --with vllm vllm --help uv run --with vllm vllm --help
``` ```
You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment. You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` to the conda environment through `pip` if you want to manage it within the environment.
```bash ```console
conda create -n myenv python=3.12 -y conda create -n myenv python=3.12 -y
conda activate myenv conda activate myenv
pip install --upgrade uv pip install --upgrade uv
@ -110,7 +110,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the
Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model:
```bash ```console
vllm serve Qwen/Qwen2.5-1.5B-Instruct vllm serve Qwen/Qwen2.5-1.5B-Instruct
``` ```
@ -124,7 +124,7 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
This server can be queried in the same format as OpenAI API. For example, to list the models: This server can be queried in the same format as OpenAI API. For example, to list the models:
```bash ```console
curl http://localhost:8000/v1/models curl http://localhost:8000/v1/models
``` ```
@ -134,7 +134,7 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY`
Once your server is started, you can query the model with input prompts: Once your server is started, you can query the model with input prompts:
```bash ```console
curl http://localhost:8000/v1/completions \ curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
@ -147,22 +147,20 @@ curl http://localhost:8000/v1/completions \
Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package:
??? Code ```python
from openai import OpenAI
```python # Modify OpenAI's API key and API base to use vLLM's API server.
from openai import OpenAI openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
# Modify OpenAI's API key and API base to use vLLM's API server. client = OpenAI(
openai_api_key = "EMPTY" api_key=openai_api_key,
openai_api_base = "http://localhost:8000/v1" base_url=openai_api_base,
client = OpenAI( )
api_key=openai_api_key, completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
base_url=openai_api_base, prompt="San Francisco is a")
) print("Completion result:", completion)
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", ```
prompt="San Francisco is a")
print("Completion result:", completion)
```
A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py> A more detailed client example can be found here: <gh-file:examples/online_serving/openai_completion_client.py>
@ -172,7 +170,7 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter
You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model:
```bash ```console
curl http://localhost:8000/v1/chat/completions \ curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
@ -186,28 +184,26 @@ curl http://localhost:8000/v1/chat/completions \
Alternatively, you can use the `openai` Python package: Alternatively, you can use the `openai` Python package:
??? Code ```python
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
```python client = OpenAI(
from openai import OpenAI api_key=openai_api_key,
# Set OpenAI's API key and API base to use vLLM's API server. base_url=openai_api_base,
openai_api_key = "EMPTY" )
openai_api_base = "http://localhost:8000/v1"
client = OpenAI( chat_response = client.chat.completions.create(
api_key=openai_api_key, model="Qwen/Qwen2.5-1.5B-Instruct",
base_url=openai_api_base, messages=[
) {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
chat_response = client.chat.completions.create( ]
model="Qwen/Qwen2.5-1.5B-Instruct", )
messages=[ print("Chat response:", chat_response)
{"role": "system", "content": "You are a helpful assistant."}, ```
{"role": "user", "content": "Tell me a joke."},
]
)
print("Chat response:", chat_response)
```
## On Attention Backends ## On Attention Backends

View File

@ -9,27 +9,27 @@ Further reading can be found in [Run:ai Model Streamer Documentation](https://gi
vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer. vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer.
You first need to install vLLM RunAI optional dependency: You first need to install vLLM RunAI optional dependency:
```bash ```console
pip3 install vllm[runai] pip3 install vllm[runai]
``` ```
To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag:
```bash ```console
vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
--load-format runai_streamer --load-format runai_streamer
``` ```
To run a model from an AWS S3 object store, run: To run a model from an AWS S3 object store, run:
```bash ```console
vllm serve s3://core-llm/Llama-3-8b \ vllm serve s3://core-llm/Llama-3-8b \
--load-format runai_streamer --load-format runai_streamer
``` ```
To run a model from an S3-compatible object store, run: To run a model from an S3-compatible object store, run:
```bash ```console
RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \
AWS_EC2_METADATA_DISABLED=true \ AWS_EC2_METADATA_DISABLED=true \
AWS_ENDPOINT_URL=https://storage.googleapis.com \ AWS_ENDPOINT_URL=https://storage.googleapis.com \
@ -44,7 +44,7 @@ You can tune parameters using `--model-loader-extra-config`:
You can tune `concurrency`, which controls the level of concurrency and the number of OS threads reading tensors from the file into the CPU buffer. You can tune `concurrency`, which controls the level of concurrency and the number of OS threads reading tensors from the file into the CPU buffer.
When reading from S3, it is the number of client instances the host opens to the S3 server. When reading from S3, it is the number of client instances the host opens to the S3 server.
```bash ```console
vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
--load-format runai_streamer \ --load-format runai_streamer \
--model-loader-extra-config '{"concurrency":16}' --model-loader-extra-config '{"concurrency":16}'
@ -53,7 +53,7 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
You can control and limit the size of the CPU memory buffer into which tensors are read from the file. You can control and limit the size of the CPU memory buffer into which tensors are read from the file.
You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
```bash ```console
vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
--load-format runai_streamer \ --load-format runai_streamer \
--model-loader-extra-config '{"memory_limit":5368709120}' --model-loader-extra-config '{"memory_limit":5368709120}'
@ -66,13 +66,13 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
vLLM also supports loading sharded models using Run:ai Model Streamer. This is particularly useful for large models that are split across multiple files. To use this feature, use the `--load-format runai_streamer_sharded` flag: vLLM also supports loading sharded models using Run:ai Model Streamer. This is particularly useful for large models that are split across multiple files. To use this feature, use the `--load-format runai_streamer_sharded` flag:
```bash ```console
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded vllm serve /path/to/sharded/model --load-format runai_streamer_sharded
``` ```
The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`: The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`:
```bash ```console
vllm serve /path/to/sharded/model \ vllm serve /path/to/sharded/model \
--load-format runai_streamer_sharded \ --load-format runai_streamer_sharded \
--model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
@ -82,7 +82,7 @@ To create sharded model files, you can use the script provided in <gh-file:examp
The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way: The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
```bash ```console
vllm serve /path/to/sharded/model \ vllm serve /path/to/sharded/model \
--load-format runai_streamer_sharded \ --load-format runai_streamer_sharded \
--model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'

View File

@ -85,37 +85,35 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
In general, only instruction-tuned models have a chat template. In general, only instruction-tuned models have a chat template.
Base models may perform poorly as they are not trained to respond to the chat conversation. Base models may perform poorly as they are not trained to respond to the chat conversation.
??? Code ```python
from vllm import LLM
```python llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
from vllm import LLM conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation)
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") for output in outputs:
conversation = [ prompt = output.prompt
{ generated_text = output.outputs[0].text
"role": "system", print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
"content": "You are a helpful assistant" ```
},
{
"role": "user",
"content": "Hello"
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
},
{
"role": "user",
"content": "Write an essay about the importance of higher education.",
},
]
outputs = llm.chat(conversation)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py> A code example can be found here: <gh-file:examples/offline_inference/basic/chat.py>

View File

@ -70,10 +70,7 @@ To make your model compatible with the Transformers backend, it needs:
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
3. `MyModel` must contain `_supports_attention_backend = True`. 3. `MyModel` must contain `_supports_attention_backend = True`.
<details> ```python title="modeling_my_model.py"
<summary>modeling_my_model.py</summary>
```python
from transformers import PreTrainedModel from transformers import PreTrainedModel
from torch import nn from torch import nn
@ -96,8 +93,6 @@ class MyModel(PreTrainedModel):
_supports_attention_backend = True _supports_attention_backend = True
``` ```
</details>
Here is what happens in the background when this model is loaded: Here is what happens in the background when this model is loaded:
1. The config is loaded. 1. The config is loaded.
@ -108,10 +103,7 @@ That's it!
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class: For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
<details> ```python title="configuration_my_model.py"
<summary>configuration_my_model.py</summary>
```python
from transformers import PretrainedConfig from transformers import PretrainedConfig
@ -131,8 +123,6 @@ class MyConfig(PretrainedConfig):
} }
``` ```
</details>
- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported). - `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s: - `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
* You only need to do this for layers which are not present on all pipeline stages * You only need to do this for layers which are not present on all pipeline stages
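For illustration, plans of that shape could look like the following sketch (the layer and tensor names here are hypothetical for a typical decoder-only model, not requirements):

```python
# Hypothetical plans illustrating the expected shapes; names are examples only.
base_model_tp_plan = {
    "layers.*.self_attn.q_proj": "colwise",
    "layers.*.self_attn.o_proj": "rowwise",
}
base_model_pp_plan = {
    # direct child layer name -> (list of input names, list of output names)
    "embed_tokens": (["input_ids"], ["inputs_embeds"]),
    "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
    "norm": (["hidden_states"], ["hidden_states"]),
}
```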
@ -178,7 +168,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project
If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository: If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository:
```bash ```console
# Download a model # Download a model
huggingface-cli download HuggingFaceH4/zephyr-7b-beta huggingface-cli download HuggingFaceH4/zephyr-7b-beta
@ -193,7 +183,7 @@ huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json
Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache: Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache:
```bash ```console
# List cached models # List cached models
huggingface-cli scan-cache huggingface-cli scan-cache
@ -208,9 +198,6 @@ huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache: Use the Hugging Face CLI to interactively [delete downloaded model](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache:
<details>
<summary>Commands</summary>
```console ```console
# The `delete-cache` command requires extra dependencies to work with the TUI. # The `delete-cache` command requires extra dependencies to work with the TUI.
# Please run `pip install huggingface_hub[cli]` to install them. # Please run `pip install huggingface_hub[cli]` to install them.
@ -237,8 +224,6 @@ Start deletion.
Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M. Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M.
``` ```
</details>
#### Using a proxy #### Using a proxy
Here are some tips for loading/downloading models from Hugging Face using a proxy: Here are some tips for loading/downloading models from Hugging Face using a proxy:
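One common approach (the addresses below are placeholders) is to export the standard proxy environment variables in the shell that downloads the model or launches vLLM:

```bash
# Placeholder proxy address; adjust to your environment.
export http_proxy=http://your.proxy.server:port
export https_proxy=http://your.proxy.server:port

vllm serve <model_name>
```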
@ -407,15 +392,15 @@ Specified using `--task embed`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|----------------------|---------------------------|-----------------------| |--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|----------------------|---------------------------|-----------------------|
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | | `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | |
| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | | `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | | | `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | |
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | | `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | |
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | | `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | |
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | | | `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | |
| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | |
| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | |
| `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | |
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | | | `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | |
!!! note !!! note
@ -442,10 +427,9 @@ Specified using `--task reward`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| |---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | |
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | |
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
If your model is not in the above list, we will try to automatically convert the model using If your model is not in the above list, we will try to automatically convert the model using
[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. [as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.
@ -461,7 +445,7 @@ Specified using `--task classify`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------| |----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | | `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
If your model is not in the above list, we will try to automatically convert the model using If your model is not in the above list, we will try to automatically convert the model using
[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. [as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
@ -472,7 +456,7 @@ Specified using `--task score`.
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------| |---------------------------------------|-------------------|--------------------------------------------------------------------------------------|-----------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
@ -578,7 +562,6 @@ Specified using `--task generate`.
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ | | `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`,`omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM. <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models: &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
@ -617,29 +600,27 @@ Specified using `--task generate`.
For the best results, we recommend using the following dependency versions (tested on A10 and L40): For the best results, we recommend using the following dependency versions (tested on A10 and L40):
```text
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
torch==2.5.1
torchvision==0.20.1
transformers==4.48.1
tokenizers==0.21.0
tiktoken==0.7.0
vllm==0.7.0

# Optional but recommended for improved performance and stability
triton==3.1.0
xformers==0.0.28.post3
uvloop==0.21.0
protobuf==5.29.3
openai==1.60.2
opencv-python-headless==4.11.0.86
pillow==10.4.0

# Installed FlashAttention (for float16 only)
flash-attn>=2.5.6  # Not used in float32, but should be documented
```
**Note:** Make sure you understand the security implications of using outdated packages. **Note:** Make sure you understand the security implications of using outdated packages.
View File
@ -34,15 +34,15 @@ output = llm.generate("San Francisco is a")
To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
```bash ```console
vllm serve facebook/opt-13b \ vllm serve facebook/opt-13b \
--tensor-parallel-size 4 --tensor-parallel-size 4
``` ```
You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:
```bash ```console
vllm serve gpt2 \ vllm serve gpt2 \
--tensor-parallel-size 4 \ --tensor-parallel-size 4 \
--pipeline-parallel-size 2 --pipeline-parallel-size 2
``` ```
@ -55,7 +55,7 @@ The first step, is to start containers and organize them into a cluster. We have
Pick a node as the head node, and run the following command: Pick a node as the head node, and run the following command:
```bash ```console
bash run_cluster.sh \ bash run_cluster.sh \
vllm/vllm-openai \ vllm/vllm-openai \
ip_of_head_node \ ip_of_head_node \
@ -66,7 +66,7 @@ bash run_cluster.sh \
On the rest of the worker nodes, run the following command: On the rest of the worker nodes, run the following command:
```bash ```console
bash run_cluster.sh \ bash run_cluster.sh \
vllm/vllm-openai \ vllm/vllm-openai \
ip_of_head_node \ ip_of_head_node \
@ -87,7 +87,7 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container,
After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
```bash ```console
vllm serve /path/to/the/model/in/the/container \ vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 \ --tensor-parallel-size 8 \
--pipeline-parallel-size 2 --pipeline-parallel-size 2
@ -95,7 +95,7 @@ After that, on any node, use `docker exec -it node /bin/bash` to enter the conta
You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
```bash ```console
vllm serve /path/to/the/model/in/the/container \ vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 16 --tensor-parallel-size 16
``` ```
View File
@ -7,27 +7,25 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain
To install LangChain, run To install LangChain, run
```bash ```console
pip install langchain langchain_community -q pip install langchain langchain_community -q
``` ```
To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`.
```python
from langchain_community.llms import VLLM

llm = VLLM(model="mosaicml/mpt-7b",
           trust_remote_code=True,  # mandatory for hf models
           max_new_tokens=128,
           top_k=10,
           top_p=0.95,
           temperature=0.8,
           # tensor_parallel_size=... # for distributed inference
)

print(llm("What is the capital of France ?"))
```
Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details.
View File
@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index
To install LlamaIndex, run To install LlamaIndex, run
```bash ```console
pip install llama-index-llms-vllm -q pip install llama-index-llms-vllm -q
``` ```
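A minimal usage sketch, assuming the `Vllm` class exposed by the `llama-index-llms-vllm` package (the model name is a placeholder):

```python
from llama_index.llms.vllm import Vllm

# Placeholder model; any Hugging Face causal LM supported by vLLM should work.
llm = Vllm(model="facebook/opt-125m", max_new_tokens=64)

print(llm.complete("What is the capital of France?"))
```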
View File
@ -15,24 +15,22 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python). To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
```python
from openai import OpenAI
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "user", "content": "Hello!"}
    ]
)

print(completion.choices[0].message)
```
!!! tip !!! tip
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
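For example, a vLLM-only parameter such as `top_k` can be forwarded through the client's `extra_body` argument (a small sketch using the same server and model as above):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

# top_k is not part of the OpenAI schema, so it is passed via extra_body,
# which the OpenAI client forwards verbatim to the vLLM server.
completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"top_k": 20},
)
print(completion.choices[0].message)
```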
@ -149,29 +147,27 @@ with `--enable-request-id-headers`.
> rather than within the vLLM layer for this reason. > rather than within the vLLM layer for this reason.
> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details. > See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
```python
completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
    ],
    extra_headers={
        "x-request-id": "sentiment-classification-00001",
    }
)
print(completion._request_id)

completion = client.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    prompt="A robot may not injure a human being",
    extra_headers={
        "x-request-id": "completion-test",
    }
)
print(completion._request_id)
```
## API Reference ## API Reference
@ -188,19 +184,15 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
```
The following extra parameters are supported: The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
```
[](){ #chat-api } [](){ #chat-api }
@ -220,19 +212,15 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
```
The following extra parameters are supported: The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
```
[](){ #embeddings-api } [](){ #embeddings-api }
@ -271,31 +259,29 @@ and passing a list of `messages` in the request. Refer to the examples below for
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
```python
import requests

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "TIGER-Lab/VLM2Vec-Full",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "Represent the given image."},
            ],
        }],
        "encoding_format": "float",
    },
)
response.raise_for_status()
response_json = response.json()
print("Embedding output:", response_json["data"][0]["embedding"])
```
=== "DSE-Qwen2-MRL" === "DSE-Qwen2-MRL"
@ -330,19 +316,15 @@ The following [pooling parameters][pooling-params] are supported.
The following extra parameters are supported by default: The following extra parameters are supported by default:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
```
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
```
[](){ #transcriptions-api } [](){ #transcriptions-api }
@ -361,19 +343,15 @@ Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
```
The following extra parameters are supported: The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
```
[](){ #tokenizer-api } [](){ #tokenizer-api }
@ -409,6 +387,8 @@ Code example: <gh-file:examples/online_serving/openai_classification_client.py>
You can classify multiple texts by passing an array of strings: You can classify multiple texts by passing an array of strings:
Request:
```bash ```bash
curl -v "http://127.0.0.1:8000/classify" \ curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
@ -421,45 +401,47 @@ curl -v "http://127.0.0.1:8000/classify" \
}' }'
``` ```
Response:

```bash
{
  "id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
  "object": "list",
  "created": 1745383065,
  "model": "jason9693/Qwen2.5-1.5B-apeach",
  "data": [
    {
      "index": 0,
      "label": "Default",
      "probs": [
        0.565970778465271,
        0.4340292513370514
      ],
      "num_classes": 2
    },
    {
      "index": 1,
      "label": "Spoiled",
      "probs": [
        0.26448777318000793,
        0.7355121970176697
      ],
      "num_classes": 2
    }
  ],
  "usage": {
    "prompt_tokens": 20,
    "total_tokens": 20,
    "completion_tokens": 0,
    "prompt_tokens_details": null
  }
}
```
You can also pass a string directly to the `input` field: You can also pass a string directly to the `input` field:
Request:
```bash ```bash
curl -v "http://127.0.0.1:8000/classify" \ curl -v "http://127.0.0.1:8000/classify" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
@ -469,33 +451,33 @@ curl -v "http://127.0.0.1:8000/classify" \
}' }'
``` ```
Response:

```bash
{
  "id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
  "object": "list",
  "created": 1745383213,
  "model": "jason9693/Qwen2.5-1.5B-apeach",
  "data": [
    {
      "index": 0,
      "label": "Default",
      "probs": [
        0.565970778465271,
        0.4340292513370514
      ],
      "num_classes": 2
    }
  ],
  "usage": {
    "prompt_tokens": 10,
    "total_tokens": 10,
    "completion_tokens": 0,
    "prompt_tokens_details": null
  }
}
```
#### Extra parameters #### Extra parameters
@ -526,6 +508,8 @@ Code example: <gh-file:examples/online_serving/openai_cross_encoder_score.py>
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.
Request:
```bash ```bash
curl -X 'POST' \ curl -X 'POST' \
'http://127.0.0.1:8000/score' \ 'http://127.0.0.1:8000/score' \
@ -539,24 +523,24 @@ curl -X 'POST' \
}' }'
``` ```
Response:

```bash
{
  "id": "score-request-id",
  "object": "list",
  "created": 693447,
  "model": "BAAI/bge-reranker-v2-m3",
  "data": [
    {
      "index": 0,
      "object": "score",
      "score": 1
    }
  ],
  "usage": {}
}
```
#### Batch inference #### Batch inference
@ -564,95 +548,95 @@ You can pass a string to `text_1` and a list to `text_2`, forming multiple sente
where each pair is built from `text_1` and a string in `text_2`. where each pair is built from `text_1` and a string in `text_2`.
The total number of pairs is `len(text_2)`. The total number of pairs is `len(text_2)`.
Request:

```bash
curl -X 'POST' \
  'http://127.0.0.1:8000/score' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "model": "BAAI/bge-reranker-v2-m3",
  "text_1": "What is the capital of France?",
  "text_2": [
    "The capital of Brazil is Brasilia.",
    "The capital of France is Paris."
  ]
}'
```
Response:

```bash
{
  "id": "score-request-id",
  "object": "list",
  "created": 693570,
  "model": "BAAI/bge-reranker-v2-m3",
  "data": [
    {
      "index": 0,
      "object": "score",
      "score": 0.001094818115234375
    },
    {
      "index": 1,
      "object": "score",
      "score": 1
    }
  ],
  "usage": {}
}
```
You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs You can pass a list to both `text_1` and `text_2`, forming multiple sentence pairs
where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`).
The total number of pairs is `len(text_2)`. The total number of pairs is `len(text_2)`.
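As a plain-Python illustration of how the pairs are formed (this mirrors the request shown below; it is not itself an API call):

```python
text_1 = ["What is the capital of Brazil?", "What is the capital of France?"]
text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]

# Pairs are formed index-wise, like zip(): (text_1[i], text_2[i]).
pairs = list(zip(text_1, text_2))
for i, (query, document) in enumerate(pairs):
    print(i, query, "<->", document)
```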
Request:

```bash
curl -X 'POST' \
  'http://127.0.0.1:8000/score' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "model": "BAAI/bge-reranker-v2-m3",
  "encoding_format": "float",
  "text_1": [
    "What is the capital of Brazil?",
    "What is the capital of France?"
  ],
  "text_2": [
    "The capital of Brazil is Brasilia.",
    "The capital of France is Paris."
  ]
}'
```
Response:

```bash
{
  "id": "score-request-id",
  "object": "list",
  "created": 693447,
  "model": "BAAI/bge-reranker-v2-m3",
  "data": [
    {
      "index": 0,
      "object": "score",
      "score": 1
    },
    {
      "index": 1,
      "object": "score",
      "score": 1
    }
  ],
  "usage": {}
}
```
#### Extra parameters #### Extra parameters
@ -691,51 +675,51 @@ Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
Result documents will be sorted by relevance, and the `index` property can be used to determine original order. Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
Request:

```bash
curl -X 'POST' \
  'http://127.0.0.1:8000/v1/rerank' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "model": "BAAI/bge-reranker-base",
  "query": "What is the capital of France?",
  "documents": [
    "The capital of Brazil is Brasilia.",
    "The capital of France is Paris.",
    "Horses and cows are both animals"
  ]
}'
```
Response:

```bash
{
  "id": "rerank-fae51b2b664d4ed38f5969b612edff77",
  "model": "BAAI/bge-reranker-base",
  "usage": {
    "total_tokens": 56
  },
  "results": [
    {
      "index": 1,
      "document": {
        "text": "The capital of France is Paris."
      },
      "relevance_score": 0.99853515625
    },
    {
      "index": 0,
      "document": {
        "text": "The capital of Brazil is Brasilia."
      },
      "relevance_score": 0.0005860328674316406
    }
  ]
}
```
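As a small client-side sketch (using the `requests` library and the same inputs as above), the `index` field can be used to restore the original document order:

```python
import requests

response = requests.post(
    "http://127.0.0.1:8000/v1/rerank",
    json={
        "model": "BAAI/bge-reranker-base",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
            "Horses and cows are both animals",
        ],
    },
)
response.raise_for_status()
results = response.json()["results"]

# Results arrive sorted by relevance; sort by "index" to recover the input order.
for result in sorted(results, key=lambda r: r["index"]):
    print(result["index"], result["relevance_score"])
```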
#### Extra parameters #### Extra parameters
View File
@ -6,38 +6,34 @@ OpenAI compatible API server.
You can start the server using Python, or using [Docker][deployment-docker]: You can start the server using Python, or using [Docker][deployment-docker]:
```bash ```console
vllm serve unsloth/Llama-3.2-1B-Instruct vllm serve unsloth/Llama-3.2-1B-Instruct
``` ```
Then query the endpoint to get the latest metrics from the server: Then query the endpoint to get the latest metrics from the server:
```console
$ curl http://0.0.0.0:8000/metrics

# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step.
# TYPE vllm:iteration_tokens_total histogram
vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0
vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0
...
```
The following metrics are exposed: The following metrics are exposed:
```python
--8<-- "vllm/engine/metrics.py:metrics-definitions"
```
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
View File
@ -60,84 +60,79 @@ To identify the particular CUDA operation that causes the error, you can add `--
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
```python
# Test PyTorch NCCL
import torch
import torch.distributed as dist
dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
data = torch.FloatTensor([1,] * 128).to("cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.cuda.synchronize()
value = data.mean().item()
world_size = dist.get_world_size()
assert value == world_size, f"Expected {world_size}, got {value}"

print("PyTorch NCCL is successful!")

# Test PyTorch GLOO
gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
cpu_data = torch.FloatTensor([1,] * 128)
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"

print("PyTorch GLOO is successful!")

if world_size <= 1:
    exit()

# Test vLLM NCCL, with cuda graph
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator

pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
# pynccl is enabled by default for 0.6.5+,
# but for 0.6.4 and below, we need to enable it manually.
# keep the code for backward compatibility because people
# prefer to read the latest documentation.
pynccl.disabled = False

s = torch.cuda.Stream()
with torch.cuda.stream(s):
    data.fill_(1)
    out = pynccl.all_reduce(data, stream=s)
    value = out.mean().item()
    assert value == world_size, f"Expected {world_size}, got {value}"

print("vLLM NCCL is successful!")

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(cuda_graph=g, stream=s):
    out = pynccl.all_reduce(data, stream=torch.cuda.current_stream())

data.fill_(1)
g.replay()
torch.cuda.current_stream().synchronize()
value = out.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"

print("vLLM NCCL with cuda graph is successful!")

dist.destroy_process_group(gloo_group)
dist.destroy_process_group()
```
If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use:
```bash ```console
NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
``` ```
If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run:
```bash ```console
NCCL_DEBUG=TRACE torchrun --nnodes 2 \
    --nproc-per-node=2 \
    --rdzv_backend=c10d \
    --rdzv_endpoint=$MASTER_ADDR test.py
```
If the script runs successfully, you should see the message `sanity check is successful!`. If the script runs successfully, you should see the message `sanity check is successful!`.
@ -170,27 +165,25 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
or an error from Python that looks like this: or an error from Python that looks like this:
```console
RuntimeError:
        An attempt has been made to start a new process before the
        current process has finished its bootstrapping phase.

        This probably means that you are not using fork to start your
        child processes and you have forgotten to use the proper idiom
        in the main module:

            if __name__ == '__main__':
                freeze_support()
                ...

        The "freeze_support()" line can be omitted if the program
        is not going to be frozen to produce an executable.

        To fix this issue, refer to the "Safe importing of main module"
        section in https://docs.python.org/3/library/multiprocessing.html
```
then you must update your Python code to guard usage of `vllm` behind a `if then you must update your Python code to guard usage of `vllm` behind a `if
__name__ == '__main__':` block. For example, instead of this: __name__ == '__main__':` block. For example, instead of this:
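A minimal sketch of both variants (the script layout and model name are placeholders): the commented-out line is the unguarded usage that triggers the error, and the guarded form below it is the fix.

```python
# Problematic: vLLM initialized at import time, so spawned worker processes
# re-execute this line when they re-import the module.
# llm = LLM(model="facebook/opt-125m")

from vllm import LLM


def main():
    llm = LLM(model="facebook/opt-125m")  # placeholder model
    print(llm.generate("San Francisco is a"))


if __name__ == "__main__":
    # The guard keeps engine construction out of child-process imports.
    main()
```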
@ -214,22 +207,20 @@ if __name__ == '__main__':
vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
```python
import torch

@torch.compile
def f(x):
    # a simple function to test torch.compile
    x = x + 1
    x = x * 2
    x = x.sin()
    return x

x = torch.randn(4, 4).cuda()
print(f(x))
```
If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example.
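As a quick diagnostic sketch (not taken from that issue), you can confirm which `torch` and `triton` builds are actually being imported:

```python
import torch
import triton

# If triton.__file__ points outside the site-packages tree that torch came from,
# a custom or incompatible triton install is likely shadowing the bundled one.
print("torch :", torch.__version__, torch.__file__)
print("triton:", triton.__version__, triton.__file__)
```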
View File
@ -10,38 +10,36 @@ The list of data collected by the latest version of vLLM can be found here: <gh-
Here is an example as of v0.4.0: Here is an example as of v0.4.0:
```json
{
  "uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
  "provider": "GCP",
  "num_cpu": 24,
  "cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
  "cpu_family_model_stepping": "6,85,7",
  "total_memory": 101261135872,
  "architecture": "x86_64",
  "platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
  "gpu_count": 2,
  "gpu_type": "NVIDIA L4",
  "gpu_memory_per_device": 23580639232,
  "model_architecture": "OPTForCausalLM",
  "vllm_version": "0.3.2+cu123",
  "context": "LLM_CLASS",
  "log_time": 1711663373492490000,
  "source": "production",
  "dtype": "torch.float16",
  "tensor_parallel_size": 1,
  "block_size": 16,
  "gpu_memory_utilization": 0.9,
  "quantization": null,
  "kv_cache_dtype": "auto",
  "enable_lora": false,
  "enable_prefix_caching": false,
  "enforce_eager": false,
  "disable_custom_all_reduce": true
}
```
You can preview the collected data by running the following command: You can preview the collected data by running the following command:
View File
@ -39,24 +39,12 @@ This living user guide outlines a few known **important changes and limitations*
For each item, our progress towards V1 support falls into one of the following states: For each item, our progress towards V1 support falls into one of the following states:
- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned. - **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
- **🟢 Functional**: Fully operational, with ongoing optimizations. - **🟢 Functional**: Fully operational, with ongoing optimizations.
- **🚧 WIP**: Under active development. - **🚧 WIP**: Under active development.
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs). - **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later. - **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
- **🔴 Deprecated**: Not planned for V1 unless there is strong demand. - **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
!!! note
    vLLM V1's unified scheduler treats both prompt and output tokens the same
    way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically
    allocate a fixed token budget per request, enabling features like chunked prefills,
    prefix caching, and speculative decoding without a strict separation between prefill
    and decode phases.

    The V1 scheduler supports multiple scheduling policies, including First-Come,
    First-Served (FCFS) and priority-based scheduling (where requests are processed
    based on assigned priority, with FCFS as a tie-breaker), configurable via the
    `--scheduling-policy` argument.
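A toy sketch (not vLLM code) of the token-budget idea described in the note, with FCFS order and chunking when a request does not fit in the remaining budget:

```python
def schedule_step(waiting: dict[str, int], token_budget: int) -> dict[str, int]:
    """Allocate at most token_budget tokens across requests, in FCFS order."""
    scheduled: dict[str, int] = {}
    for request_id, tokens_needed in waiting.items():
        if token_budget == 0:
            break
        num_tokens = min(tokens_needed, token_budget)  # chunk if it does not fit
        scheduled[request_id] = num_tokens
        token_budget -= num_tokens
    return scheduled


# req-0 is chunked to the remaining budget; req-1 and req-2 wait for the next step.
print(schedule_step({"req-0": 512, "req-1": 64, "req-2": 8}, token_budget=256))
```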
### Hardware ### Hardware
| Hardware | Status | | Hardware | Status |
@ -82,7 +70,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
|-----------------------------|------------------------------------------------------------------------------------| |-----------------------------|------------------------------------------------------------------------------------|
| **Decoder-only Models** | <nobr>🚀 Optimized</nobr> | | **Decoder-only Models** | <nobr>🚀 Optimized</nobr> |
| **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> | | **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> |
| **Embedding Models** | <nobr>🟢 Functional</nobr> | | **Embedding Models** | <nobr>🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188))</nobr> |
| **Mamba Models** | <nobr>🚧 WIP ([PR #19327](https://github.com/vllm-project/vllm/pull/19327))</nobr> | | **Mamba Models** | <nobr>🚧 WIP ([PR #19327](https://github.com/vllm-project/vllm/pull/19327))</nobr> |
| **Multimodal Models** | <nobr>🟢 Functional</nobr> | | **Multimodal Models** | <nobr>🟢 Functional</nobr> |
@ -92,11 +80,11 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco
This corresponds to the V1 column in our [list of supported models][supported-models]. This corresponds to the V1 column in our [list of supported models][supported-models].
See below for the status of models that are not yet supported or have more features planned in V1. See below for the status of models that are still not yet supported in V1.
#### Embedding Models #### Embedding Models
The initial basic support is now functional. The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188).
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)
View File
@ -12,10 +12,7 @@ def parse_args():
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments # Set example specific arguments
parser.set_defaults( parser.set_defaults(
    model="intfloat/e5-mistral-7b-instruct",
    task="embed",
    enforce_eager=True,
    max_model_len=1024,
)
return parser.parse_args() return parser.parse_args()
View File
@ -29,14 +29,14 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e
To follow along with this example, you can download the example batch, or create your own batch file in your working directory. To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
```bash ```console
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
``` ```
Once you've created your batch file it should look like this Once you've created your batch file it should look like this
```bash ```console
cat offline_inference/openai_batch/openai_example_batch.jsonl $ cat offline_inference/openai_batch/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
``` ```
@ -47,7 +47,7 @@ The batch running tool is designed to be used from the command line.
You can run the batch with the following command, which will write its results to a file called `results.jsonl` You can run the batch with the following command, which will write its results to a file called `results.jsonl`
```bash ```console
python -m vllm.entrypoints.openai.run_batch \ python -m vllm.entrypoints.openai.run_batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \ -i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \ -o results.jsonl \
@ -56,7 +56,7 @@ python -m vllm.entrypoints.openai.run_batch \
or use command-line: or use command-line:
```bash ```console
vllm run-batch \ vllm run-batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \ -i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \ -o results.jsonl \
@ -67,8 +67,8 @@ vllm run-batch \
You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl` You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl`
```bash ```console
cat results.jsonl $ cat results.jsonl
{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null} {"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null}
{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null} {"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null}
``` ```
@ -79,7 +79,7 @@ The batch runner supports remote input and output urls that are accessible via h
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
```bash ```console
python -m vllm.entrypoints.openai.run_batch \ python -m vllm.entrypoints.openai.run_batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \ -o results.jsonl \
@ -88,7 +88,7 @@ python -m vllm.entrypoints.openai.run_batch \
or use command-line: or use command-line:
```bash ```console
vllm run-batch \ vllm run-batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \ -o results.jsonl \
@ -112,21 +112,21 @@ To integrate with cloud blob storage, we recommend using presigned urls.
To follow along with this example, you can download the example batch, or create your own batch file in your working directory. To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
```bash ```console
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
``` ```
Once you've created your batch file it should look like this Once you've created your batch file it should look like this
```bash ```console
cat offline_inference/openai_batch/openai_example_batch.jsonl $ cat offline_inference/openai_batch/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
``` ```
Now upload your batch file to your S3 bucket. Now upload your batch file to your S3 bucket.
```bash ```console
aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
``` ```
@ -181,7 +181,7 @@ output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AW
You can now run the batch runner, using the urls generated in the previous section. You can now run the batch runner, using the urls generated in the previous section.
```bash ```console
python -m vllm.entrypoints.openai.run_batch \ python -m vllm.entrypoints.openai.run_batch \
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
@ -190,7 +190,7 @@ python -m vllm.entrypoints.openai.run_batch \
or use command-line: or use command-line:
```bash ```console
vllm run-batch \ vllm run-batch \
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
@ -201,7 +201,7 @@ vllm run-batch \
Your results are now on S3. You can view them in your terminal by running Your results are now on S3. You can view them in your terminal by running
```bash ```console
aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
``` ```
@ -230,8 +230,8 @@ You can run the batch using the same command as in earlier examples.
You can check your results by running `cat results.jsonl` You can check your results by running `cat results.jsonl`
```bash ```console
cat results.jsonl $ cat results.jsonl
{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
... ...
``` ```
@ -261,8 +261,8 @@ You can run the batch using the same command as in earlier examples.
You can check your results by running `cat results.jsonl` You can check your results by running `cat results.jsonl`
```bash ```console
cat results.jsonl $ cat results.jsonl
{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} {"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} {"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
``` ```
View File
@ -22,19 +22,15 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
# If you want to load the official original version, the init parameters are # If you want to load the official original version, the init parameters are
# as follows. # as follows.
model = LLM(
def get_model() -> LLM: model=model_name,
"""Initializes and returns the LLM model for Qwen3-Reranker.""" task="score",
return LLM( hf_overrides={
model=model_name, "architectures": ["Qwen3ForSequenceClassification"],
task="score", "classifier_from_token": ["no", "yes"],
hf_overrides={ "is_original_qwen3_reranker": True,
"architectures": ["Qwen3ForSequenceClassification"], },
"classifier_from_token": ["no", "yes"], )
"is_original_qwen3_reranker": True,
},
)
# Why do we need hf_overrides for the official original version: # Why do we need hf_overrides for the official original version:
# vllm converts it to Qwen3ForSequenceClassification when loaded for # vllm converts it to Qwen3ForSequenceClassification when loaded for
@ -55,8 +51,7 @@ suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n" query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}" document_template = "<Document>: {doc}{suffix}"
if __name__ == "__main__":
def main() -> None:
instruction = ( instruction = (
"Given a web search query, retrieve relevant passages that answer the query" "Given a web search query, retrieve relevant passages that answer the query"
) )
@ -77,13 +72,6 @@ def main() -> None:
] ]
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
model = get_model()
outputs = model.score(queries, documents) outputs = model.score(queries, documents)
print("-" * 30)
print([output.outputs.score for output in outputs]) print([output.outputs.score for output in outputs])
print("-" * 30)
if __name__ == "__main__":
main()
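Read end to end, the module-level version of this example reduces to roughly the following. This is a condensed sketch of the code shown in the diff; the query/document strings are made up for illustration, and `prefix` is only stubbed here (the real example defines a longer chat-template prefix earlier in the file):

```python
from vllm import LLM

# Values taken from the diff above; `prefix` is a stand-in for the longer
# chat-template prefix defined earlier in the example file.
model_name = "Qwen/Qwen3-Reranker-0.6B"
prefix = "<|im_start|>user\n"  # placeholder, not the example's real prefix
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"

# Load the official original checkpoint as a sequence-classification model.
model = LLM(
    model=model_name,
    task="score",
    hf_overrides={
        "architectures": ["Qwen3ForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    },
)

instruction = "Given a web search query, retrieve relevant passages that answer the query"
queries = [
    query_template.format(prefix=prefix, instruction=instruction, query="What is the capital of China?")
]
documents = [document_template.format(doc="The capital of China is Beijing.", suffix=suffix)]

outputs = model.score(queries, documents)
print([output.outputs.score for output in outputs])
```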

View File

@ -1040,37 +1040,6 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
) )
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
limit_mm_per_prompt={modality: 1},
)
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# SkyworkR1V # SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
@ -1143,7 +1112,6 @@ model_example_map = {
"skywork_chat": run_skyworkr1v, "skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm, "smolvlm": run_smolvlm,
"tarsier": run_tarsier, "tarsier": run_tarsier,
"tarsier2": run_tarsier2,
} }

View File

@ -94,7 +94,6 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full", model="TIGER-Lab/VLM2Vec-Full",
task="embed", task="embed",
max_model_len=4096,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4}, mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},

View File

@ -828,32 +828,6 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=32768,
limit_mm_per_prompt={"image": len(image_urls)},
hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]},
)
prompt = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{'<|image_pad|>' * len(image_urls)}"
f"<|vision_end|>{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
model_example_map = { model_example_map = {
"aria": load_aria, "aria": load_aria,
"aya_vision": load_aya_vision, "aya_vision": load_aya_vision,
@ -879,7 +853,6 @@ model_example_map = {
"qwen2_5_vl": load_qwen2_5_vl, "qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm, "smolvlm": load_smolvlm,
"tarsier": load_tarsier, "tarsier": load_tarsier,
"tarsier2": load_tarsier2,
} }

View File

@ -1,244 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_response(response, tool_functions, original_query):
"""Process a non-streaming response with possible tool calls"""
print("\n--- Response Output ---")
# Check if the response has content
if response.choices[0].message.content:
print(f"Content: {response.choices[0].message.content}")
# Check if the response has tool calls
if response.choices[0].message.tool_calls:
print("--------------------------------")
print(f"Tool calls: {response.choices[0].message.tool_calls}")
print("--------------------------------")
# Collect all tool calls and results before making follow-up request
tool_results = []
assistant_message = {"role": "assistant"}
if response.choices[0].message.content:
assistant_message["content"] = response.choices[0].message.content
assistant_tool_calls = []
# Process each tool call
for tool_call in response.choices[0].message.tool_calls:
function_name = tool_call.function.name
function_args = tool_call.function.arguments
function_id = tool_call.id
print(f"Function called: {function_name}")
print(f"Arguments: {function_args}")
print(f"Function ID: {function_id}")
# Execute the function
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(f"\n--- Function Result ---\n{function_result}\n")
# Add tool call to assistant message
assistant_tool_calls.append(
{
"id": function_id,
"type": "function",
"function": {"name": function_name, "arguments": function_args},
}
)
# Add tool result to tool_results
tool_results.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Add tool_calls to assistant message
assistant_message["tool_calls"] = assistant_tool_calls
# Create a follow-up message with all function results
follow_up_messages = [
{"role": "user", "content": original_query},
assistant_message,
]
# Add all tool results to the messages
follow_up_messages.extend(tool_results)
# Get completion with all tool results in a single follow-up
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=False,
)
print("\n--- Follow-up Response ---")
print(follow_up_response.choices[0].message.content)
print("--- End Follow-up ---\n")
print("--- End Response ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create non-streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=False,
)
# Process the non-streaming response, passing the original query
process_response(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()
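Condensed, the round trip that this removed example exercised looks roughly like this (one tool and no follow-up turn; the server is assumed to be running as described in the docstring above):

```python
import json

from openai import OpenAI

client = OpenAI(api_key="empty", base_url="http://localhost:8000/v1")


def get_weather(location: str, unit: str):
    return f"Weather in {location} is 22 degrees {unit}."


tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and state, e.g., 'San Francisco, CA'",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location", "unit"],
            },
        },
    }
]

# Ask a question that should trigger a tool call, then execute it locally.
response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "I want to know the weather in San Francisco"}],
    tools=tools,
    tool_choice="auto",
)

for tool_call in response.choices[0].message.tool_calls or []:
    args = json.loads(tool_call.function.arguments)
    print(tool_call.function.name, "->", get_weather(**args))
```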

View File

@ -1,272 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: E501
"""
Set up this example by starting a vLLM OpenAI-compatible server with tool call
options enabled for xLAM-2 models:
vllm serve --model Salesforce/Llama-xLAM-2-8b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
OR
vllm serve --model Salesforce/xLAM-2-3b-fc-r --enable-auto-tool-choice --tool-call-parser xlam
This example demonstrates streaming tool calls with xLAM models.
"""
import json
import time
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "empty"
openai_api_base = "http://localhost:8000/v1"
# Define tool functions
def get_weather(location: str, unit: str):
return f"Weather in {location} is 22 degrees {unit}."
def calculate_expression(expression: str):
try:
result = eval(expression)
return f"The result of {expression} is {result}"
except Exception as e:
return f"Could not calculate {expression}: {e}"
def translate_text(text: str, target_language: str):
return f"Translation of '{text}' to {target_language}: [translated content]"
# Define tools
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "City and state, e.g., 'San Francisco, CA'",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location", "unit"],
},
},
},
{
"type": "function",
"function": {
"name": "calculate_expression",
"description": "Calculate a mathematical expression",
"parameters": {
"type": "object",
"properties": {
"expression": {
"type": "string",
"description": "Mathematical expression to evaluate, needs to be a valid Python expression",
}
},
"required": ["expression"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to another language",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string", "description": "Text to translate"},
"target_language": {
"type": "string",
"description": "Target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
# Map of function names to implementations
tool_functions = {
"get_weather": get_weather,
"calculate_expression": calculate_expression,
"translate_text": translate_text,
}
def process_stream(response, tool_functions, original_query):
"""Process a streaming response with possible tool calls"""
# Track multiple tool calls
tool_calls = {} # Dictionary to store tool calls by ID
current_id = None
print("\n--- Stream Output ---")
for chunk in response:
# Handle tool calls in the stream
if chunk.choices[0].delta.tool_calls:
for tool_call_chunk in chunk.choices[0].delta.tool_calls:
# Get the tool call ID
if hasattr(tool_call_chunk, "id") and tool_call_chunk.id:
current_id = tool_call_chunk.id
if current_id not in tool_calls:
tool_calls[current_id] = {
"function_name": None,
"function_args": "",
"function_id": current_id,
}
# Extract function information as it comes in chunks
if (
hasattr(tool_call_chunk, "function")
and current_id
and current_id in tool_calls
):
if (
hasattr(tool_call_chunk.function, "name")
and tool_call_chunk.function.name
):
tool_calls[current_id]["function_name"] = (
tool_call_chunk.function.name
)
print(f"Function called: {tool_call_chunk.function.name}")
if (
hasattr(tool_call_chunk.function, "arguments")
and tool_call_chunk.function.arguments
):
tool_calls[current_id]["function_args"] += (
tool_call_chunk.function.arguments
)
print(f"Arguments chunk: {tool_call_chunk.function.arguments}")
# Handle regular content in the stream
elif chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Stream ---\n")
# Execute each function call and build messages for follow-up
follow_up_messages = [{"role": "user", "content": original_query}]
for tool_id, tool_data in tool_calls.items():
function_name = tool_data["function_name"]
function_args = tool_data["function_args"]
function_id = tool_data["function_id"]
if function_name and function_args:
try:
# Parse the JSON arguments
args = json.loads(function_args)
# Call the function with the arguments
function_result = tool_functions[function_name](**args)
print(
f"\n--- Function Result ({function_name}) ---\n{function_result}\n"
)
# Add the assistant message with tool call
follow_up_messages.append(
{
"role": "assistant",
"tool_calls": [
{
"id": function_id,
"type": "function",
"function": {
"name": function_name,
"arguments": function_args,
},
}
],
}
)
# Add the tool message with function result
follow_up_messages.append(
{
"role": "tool",
"tool_call_id": function_id,
"content": function_result,
}
)
except Exception as e:
print(f"Error executing function: {e}")
# Only send follow-up if we have results to process
if len(follow_up_messages) > 1:
# Create a follow-up message with all the function results
follow_up_response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=follow_up_messages,
stream=True,
)
print("\n--- Follow-up Response ---")
for chunk in follow_up_response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
print("\n--- End Follow-up ---\n")
def run_test_case(query, test_name):
"""Run a single test case with the given query"""
print(f"\n{'=' * 50}\nTEST CASE: {test_name}\n{'=' * 50}")
print(f"Query: '{query}'")
start_time = time.time()
# Create streaming chat completion request
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": query}],
tools=tools,
tool_choice="auto",
stream=True,
)
# Process the streaming response
process_stream(response, tool_functions, query)
end_time = time.time()
print(f"Test completed in {end_time - start_time:.2f} seconds")
def main():
# Initialize OpenAI client
global client
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
# Run test cases
test_cases = [
("I want to know the weather in San Francisco", "Weather Information"),
("Calculate 25 * 17 + 31", "Math Calculation"),
("Translate 'Hello world' to Spanish", "Text Translation"),
("What is the weather in Tokyo and New York in celsius", "Multiple Tool Usage"),
]
# Execute all test cases
for query, test_name in test_cases:
run_test_case(query, test_name)
time.sleep(1) # Small delay between tests
print("\nAll tests completed.")
if __name__ == "__main__":
main()
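The streaming variant's core bookkeeping, reduced to a self-contained sketch under the same assumptions: tool-call arguments arrive in fragments and are buffered per tool-call id, exactly as the removed `process_stream` does.

```python
import json

from openai import OpenAI

client = OpenAI(api_key="empty", base_url="http://localhost:8000/v1")

tools = [
    {
        "type": "function",
        "function": {
            "name": "calculate_expression",
            "description": "Calculate a mathematical expression",
            "parameters": {
                "type": "object",
                "properties": {
                    "expression": {"type": "string", "description": "Mathematical expression to evaluate"}
                },
                "required": ["expression"],
            },
        },
    }
]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "Calculate 25 * 17 + 31"}],
    tools=tools,
    tool_choice="auto",
    stream=True,
)

# Tool-call names and argument strings arrive in chunks; key the buffers by id.
tool_calls = {}
current_id = None
for chunk in response:
    delta = chunk.choices[0].delta
    if delta.tool_calls:
        for tc in delta.tool_calls:
            if getattr(tc, "id", None):
                current_id = tc.id
                tool_calls.setdefault(current_id, {"name": None, "arguments": ""})
            if current_id and tc.function:
                if tc.function.name:
                    tool_calls[current_id]["name"] = tc.function.name
                if tc.function.arguments:
                    tool_calls[current_id]["arguments"] += tc.function.arguments
    elif delta.content:
        print(delta.content, end="")

for call in tool_calls.values():
    print(call["name"], json.loads(call["arguments"]))
```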

View File

@ -1,23 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to use the vLLM API server to perform audio
transcription with the `openai/whisper-large-v3` model.
Before running this script, you must start the vLLM server with the following command:
vllm serve openai/whisper-large-v3
Requirements:
- vLLM with audio support
- openai Python SDK
- httpx for streaming support
The script performs:
1. Synchronous transcription using OpenAI-compatible API.
2. Streaming transcription using raw HTTP request to the vLLM server.
"""
import asyncio import asyncio
import json import json
@ -39,9 +21,6 @@ client = OpenAI(
def sync_openai(): def sync_openai():
"""
Perform synchronous transcription using OpenAI-compatible API.
"""
with open(str(mary_had_lamb), "rb") as f: with open(str(mary_had_lamb), "rb") as f:
transcription = client.audio.transcriptions.create( transcription = client.audio.transcriptions.create(
file=f, file=f,
@ -58,11 +37,11 @@ def sync_openai():
print("transcription result:", transcription.text) print("transcription result:", transcription.text)
sync_openai()
# OpenAI Transcription API client does not support streaming. # OpenAI Transcription API client does not support streaming.
async def stream_openai_response(): async def stream_openai_response():
"""
Perform streaming transcription using vLLM's raw HTTP streaming API.
"""
data = { data = {
"language": "en", "language": "en",
"stream": True, "stream": True,
@ -89,15 +68,7 @@ async def stream_openai_response():
# Extract and print the content # Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content") content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="") print(content, end="")
print() # Final newline after stream ends
def main(): # Run the asynchronous function
sync_openai() asyncio.run(stream_openai_response())
# Run the asynchronous function
asyncio.run(stream_openai_response())
if __name__ == "__main__":
main()
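For reference, the synchronous path kept by this change boils down to the following sketch (the audio file name is a placeholder for the bundled sample clip, and the server is assumed to be started with `vllm serve openai/whisper-large-v3`):

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# Transcribe a local audio file via the OpenAI-compatible transcription endpoint.
with open("mary_had_lamb.ogg", "rb") as f:  # placeholder file name
    transcription = client.audio.transcriptions.create(
        file=f,
        model="openai/whisper-large-v3",
        language="en",
    )
print("transcription result:", transcription.text)
```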

Some files were not shown because too many files have changed in this diff