Compare commits
126 Commits
benchmark-… ... low_latenc…
79acf80471, 8b464d9660, 889ebb2638, 3ad986c28b, 344e193b7d, fb1c933ade,
72c5b97231, fa93cd9f60, aec9674dbe, 7fcc4223dc, 8262a3e23b, f211331c48,
9053d0b134, cb3f2d8d10, c12df53b60, d1aeea7553, d8bccde686, 20e489eaa1,
4213475ec7, d92879baf6, 690fe019f0, ed7a29d9f8, 756848e79e, 18445edd0f,
30215ca61f, 838cedade7, 4283a28c2f, 93a126fbc7, 8e4b351a0c, 9869453c42,
3642c59aa8, 43eea2953b, de7eb10ce4, fd11a325b8, 4d17e20310, 10fd1d7380,
52b4f4a8d7, e782e0a170, dc2ceca5c5, f8acd01ff7, c48334d405, 909fdaf152,
8c1c926d00, df6f3ce883, 513f074766, b07bf83c7d, 53e8cf53a4, 54271bb766,
9e96f56efb, b278911229, 7bd0c7745c, 1cf0719ebd, 537d5ee025, c8e5be35f7,
a6e72e1e4f, 5e83a7277f, 68af5f6c5c, 8de2901fea, c53e0730cb, a0e619e62a,
70116459c3, 65e262b93b, 43faa0461a, 48cb2109b6, a5450f11c9, 9d98ab5ec6,
df5c879527, 423e9f1cbe, 0bd7f8fca5, d5615af9ae, 19dcc02a72, 7feae92c1f,
f851b84266, fc966e9cc6, ef19e67d2c, a41351f363, 6aae216b4e, b22980a1dc,
881f735827, 2f54045508, 5aa6efb9a5, 6ca0234478, 649818995f, 7a0a9da72b,
69bff9bc89, 41ca7eb491, eef364723c, 0d6e187e88, 9420a1fc30, 583e900996,
05e1fbfc52, fe92176321, 6d0df0ebeb, 0fa939e2d1, 0422ce109f, 47bdee409c,
49f189439d, 5adf6f6b7f, 4115f19958, 340d7b1b21, 1bcbcbf574, 82e43b2d7e,
67309a1cb5, b724afe343, 21f4f1c9a4, b0c1f6202d, c0dfd97519, a9138e85b1,
0a05ed57e6, 14288d1332, b411418ff0, 2bc0f72ae5, 9c1244de57, db2f8d915c,
6167c0e5d2, ed2e464653, 2c8ed8ee48, ed50f46641, 46e678bcff, 6b2427f995,
b07d741661, 41fb013d29, 32d4b669d0, 3cde34a4a4, bdb3660312, f3a21e9c68
@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

@@ -1,3 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,3 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

@@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

@@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

@@ -1,3 +1,4 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
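Each of the lm-eval baseline configs above pairs a `model_name` with a `tasks` list and a header comment naming the script it is meant for. The sketch below is not part of the diff: the config filename, the `-b`/`-l`/`-f` defaults, and the helper function are illustrative assumptions; only the `model_name` and `tasks[].name` fields appear in the hunks.

```python
# Illustrative only: read one of the baseline configs above and echo the
# command its header comment points at.
import yaml  # assumes PyYAML is available


def baseline_command(config_path: str, tp_size: int | None = None) -> str:
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    model = cfg["model_name"]
    task_names = [t["name"] for t in cfg.get("tasks", [])]
    script = ("./run-lm-eval-gsm-vllm-baseline.sh" if tp_size is not None
              else ".buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh")
    cmd = f"bash {script} -m {model} -b auto -l 1000 -f 5"
    if tp_size is not None:
        cmd += f" -t {tp_size}"
    return cmd + f"  # tasks: {', '.join(task_names)}"


print(baseline_command("Qwen2-57B-A14B-Instruct.yaml", tp_size=4))
```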
@@ -5,7 +5,12 @@
set -ex

# Setup cleanup
remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman rm -f "$container_id" || true
fi
podman system prune -f
}
trap remove_docker_container EXIT
remove_docker_container

@@ -13,17 +18,17 @@ remove_docker_container
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

# Run the image
podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)

function cpu_tests() {

# offline inference
podman exec cpu-test-ubi9-ppc bash -c "
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

# Run basic model test
podman exec cpu-test-ubi9-ppc bash -c "
podman exec -it "$container_id" bash -c "
set -e
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator

@@ -33,6 +38,8 @@ function cpu_tests() {
}

# All of CPU tests are expected to be finished less than 40 mins.

export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
@@ -19,6 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest pytest-asyncio tpu-info \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \
@@ -299,6 +299,7 @@ steps:
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py

- label: PyTorch Fullgraph Smoke Test # 9min
source_file_dependencies:

@@ -583,6 +584,8 @@ steps:
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
.github/CODEOWNERS (vendored, 1 change)
@@ -12,6 +12,7 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
CMakeLists.txt @tlrmchlsmth

# vLLM V1
.github/mergify.yml (vendored, 22 changes)
@@ -126,6 +126,28 @@ pull_request_rules:
remove:
- tpu

- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
- or:
- files~=^tests/tool_use/
- files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/source/features/tool_calling.md
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
- files=docs/source/getting_started/examples/chat_with_tools.md
- files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
actions:
label:
add:
- tool-calling

- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
.gitignore (vendored, 1 change)
@@ -3,7 +3,6 @@

# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
!vllm/vllm_flash_attn/fa_utils.py

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -251,7 +251,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")

# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

@@ -269,7 +269,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.8.0
GIT_TAG v3.9.0
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.

@@ -290,7 +290,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp")
"csrc/cutlass_extensions/common.cpp"
"csrc/attention/mla/cutlass_mla_entry.cu")

set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"

@@ -463,7 +464,26 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(FP4_ARCHS)
endif()

#
# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
set(SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${MLA_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
# Add MLA-specific include directories only to MLA source files
set_source_files_properties(${SRCS}
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
else()
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
# clear MLA_ARCHS
set(MLA_ARCHS)
endif()

# CUTLASS MoE kernels

# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
@@ -63,14 +63,16 @@ class Request:
output_len: int


def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
def sample_tokens(tokenizer: PreTrainedTokenizerBase,
length: int) -> list[int]:
vocab = tokenizer.get_vocab()
all_special_ids = set(tokenizer.all_special_ids)

# Remove the special tokens.
vocab = {
k: v
for k, v in vocab.items() if k not in tokenizer.all_special_ids
}
return random.choices(list(vocab.values()), k=length)
return random.choices(
[v for k, v in vocab.items() if k not in all_special_ids],
k=length,
)


def sample_requests_from_dataset(
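A small illustration of the behavioural change above: the rewritten `sample_tokens` returns token IDs with special tokens filtered out, rather than a string, so callers decode the IDs themselves. The snippet is a sketch only; the tokenizer name is an arbitrary example and not taken from the diff.

```python
# Sketch of the new contract: sample random, non-special token IDs and let the
# caller decode them into synthetic prompt text.
import random

from transformers import AutoTokenizer, PreTrainedTokenizerBase


def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
    vocab = tokenizer.get_vocab()
    special = set(tokenizer.all_special_ids)
    # Drop special tokens so they never appear inside the sampled prompt.
    return random.choices(
        [tok_id for tok_id in vocab.values() if tok_id not in special],
        k=length,
    )


tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
prompt_ids = sample_tokens(tokenizer, length=32)
print(tokenizer.decode(prompt_ids))
```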
@@ -713,7 +713,7 @@ def main(args: argparse.Namespace):
))

# Save config and results to json
if args.save_result:
if args.save_result or args.append_result:
result_json: dict[str, Any] = {}

# Setup

@@ -734,6 +734,14 @@ def main(args: argparse.Namespace):
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format."
)
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency

# Merge with benchmark result
result_json = {**result_json, **benchmark_result}

if not args.save_detailed:
# Remove fields with too many data points

@@ -744,15 +752,6 @@ def main(args: argparse.Namespace):
if field in result_json:
del result_json[field]

# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency

# Merge with benchmark result
result_json = {**result_json, **benchmark_result}

# Save to file
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"

@@ -762,7 +761,12 @@ def main(args: argparse.Namespace):
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile:
with open(file_name,
mode="a+" if args.append_result else "w",
encoding='utf-8') as outfile:
# Append a newline.
if args.append_result and outfile.tell() != 0:
outfile.write("\n")
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)

@@ -894,6 +898,11 @@ if __name__ == "__main__":
help="When saving the results, whether to include per request "
"information such as response, error, ttfs, tpots, etc.",
)
parser.add_argument(
"--append-result",
action="store_true",
help="Append the benchmark result to the existing json file.",
)
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
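The hunks above add an `--append-result` flag so repeated benchmark runs can accumulate into one file. A minimal sketch of that append pattern, with a hypothetical file name and record, is shown below.

```python
# Sketch of the append-vs-overwrite logic introduced above: open in "a+" when
# appending, and separate records with a newline so the file holds one JSON
# object per line.
import json


def write_result(record: dict, file_name: str, append: bool) -> None:
    with open(file_name, mode="a+" if append else "w", encoding="utf-8") as f:
        if append and f.tell() != 0:
            f.write("\n")
        json.dump(record, f)


write_result({"request_rate": "inf", "burstiness": 1.0}, "benchmark.json", append=True)
```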
@@ -150,17 +150,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,

elif args.dataset == "grammar":
schema = """
?start: select_statement
root ::= select_statement

?select_statement: "SELECT " column_list " FROM " table_name
select_statement ::= "SELECT " column " from " table " where " condition

?column_list: column_name ("," column_name)*
column ::= "col_1 " | "col_2 "

?table_name: identifier
table ::= "table_1 " | "table_2 "

?column_name: identifier
condition ::= column "= " number

?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
number ::= "1 " | "2 "
"""
prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table."
@@ -17,8 +17,14 @@ from torch.utils.benchmark import Measurement as TMeasurement
from utils import ArgPool, Bench, CudaGraphBenchParams
from weight_shapes import WEIGHT_SHAPES

from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
lora_shrink)
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
_LORA_B_PTR_DICT)

from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
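The change above guards the Triton-backed LoRA imports behind `HAS_TRITON` so the benchmark module can still be imported on machines without Triton. A generic sketch of that guard pattern (the fallback message is illustrative; the `vllm.lora.ops.triton_ops` names are the ones shown in the hunk):

```python
# Import optional Triton-backed kernels only when Triton is present.
try:
    import triton  # noqa: F401
    HAS_TRITON = True
except ImportError:
    HAS_TRITON = False

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
    print("Triton LoRA kernels available:", LoRAKernelMeta, lora_expand, lora_shrink)
else:
    print("Triton not found; LoRA Triton kernels unavailable.")
```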
@@ -553,9 +553,8 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
if not hasattr(config, "hidden_size"):
# Support for llama4
config = config.text_config
# Support for llama4
config = config.get_text_config()
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok
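The hunk above swaps direct access to `config.text_config` for `config.get_text_config()`, which also works when the text settings are not nested. A small illustration, assuming a transformers release recent enough to provide `get_text_config()` (the model name is an arbitrary example, not taken from the diff):

```python
# For multimodal configs (e.g. Llama 4 style) get_text_config() returns the
# nested text config; for plain text-only configs it returns the config itself.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
text_config = config.get_text_config()
print(text_config.num_local_experts, text_config.num_experts_per_tok)
```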
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
csrc/attention/mla/cutlass_mla_entry.cu (new file, 38 lines)
@@ -0,0 +1,38 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <torch/all.h>

#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale);
#endif

void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale);
#endif
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
}
csrc/attention/mla/cutlass_mla_kernels.cu (new file, 225 lines)
@@ -0,0 +1,225 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include "cute/tensor.hpp"

#include "cutlass/cutlass.h"
#include "cutlass/kernel_hardware_info.h"

#include "cutlass_extensions/common.hpp"

#include "device/sm100_mla.hpp"
#include "kernel/sm100_mla_tile_scheduler.hpp"

using namespace cute;
using namespace cutlass::fmha::kernel;

template <typename T, bool PersistenceOption = true>
struct MlaSm100 {
using Element = T;
using ElementAcc = float;
using ElementOut = T;

using TileShape = Shape<_128, _128, Shape<_512, _64>>;
using TileShapeH = cute::tuple_element_t<0, TileShape>;
using TileShapeD = cute::tuple_element_t<2, TileShape>;

// H K (D_latent D_rope) B
using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;

using StrideQ = cute::tuple<int64_t, _1, int64_t>; // H D B
using StrideK = cute::tuple<int64_t, _1, int64_t>; // K D B
using StrideO = StrideK; // H D B
using StrideLSE = cute::tuple<_1, int>; // H B

using TileScheduler =
std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
Sm100MlaIndividualTileScheduler>;

using FmhaKernel =
cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
/*kIsCpAsync=*/true>;
using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
};

template <typename T>
typename T::Fmha::Arguments args_from_options(
at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
at::Tensor const& page_table, double scale) {
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = q_nope.device().index();
hw_info.sm_count =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);

int batches = q_nope.sizes()[0];
int page_count_per_seq = page_table.sizes()[1];
int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
int page_size = kv_c_and_k_pe_cache.sizes()[1];
int max_seq_len = page_size * page_count_per_seq;
using TileShapeH = typename T::TileShapeH;
using TileShapeD = typename T::TileShapeD;
auto problem_shape =
cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);

auto [H, K, D, B] = problem_shape;
auto [D_latent, D_rope] = D;

using StrideQ = typename T::StrideQ;
using StrideK = typename T::StrideK;
using StrideO = typename T::StrideO;
using StrideLSE = typename T::StrideLSE;

StrideQ stride_Q_latent = cute::make_tuple(
static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
static_cast<int64_t>(H * D_rope));
StrideK stride_C =
cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
static_cast<int64_t>(page_size * (D_latent + D_rope)));
StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
static_cast<int64_t>(H * D_latent));

using Element = typename T::Element;
using ElementOut = typename T::ElementOut;
using ElementAcc = typename T::ElementAcc;
auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
auto scale_f = static_cast<float>(scale);
typename T::Fmha::Arguments arguments{
problem_shape,
{scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
stride_C, C_ptr + D_latent, stride_C,
static_cast<int*>(seq_lens.data_ptr()),
static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
page_size},
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info,
-1, // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
// split_kv automatically based on batch size and sequence length to balance
// workload across available SMs. Consider using var_split_kv for manual
// control if needed.
T::Fmha::set_split_kv(arguments);
return arguments;
}

template <typename Element>
void runMla(at::Tensor const& out, at::Tensor const& q_nope,
at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
at::Tensor const& seq_lens, at::Tensor const& page_table,
float scale, cudaStream_t stream) {
using MlaSm100Type = MlaSm100<Element>;
typename MlaSm100Type::Fmha fmha;
auto arguments = args_from_options<MlaSm100Type>(
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
auto workspace = torch::empty(workspace_size, workspace_options);

CUTLASS_CHECK(fmha.can_implement(arguments));

CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));

CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
}

void cutlass_mla_decode_sm100a(torch::Tensor const& out,
torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale) {
TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
"kv_c_and_k_pe_cache must be a 3D tensor");
TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");

auto B_q_nope = q_nope.size(0);
auto H_q_nope = q_nope.size(1);
auto D_q_nope = q_nope.size(2);
auto B_q_pe = q_pe.size(0);
auto H_q_pe = q_pe.size(1);
auto D_q_pe = q_pe.size(2);
auto B_pt = page_table.size(0);
auto PAGE_NUM = page_table.size(1);
auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
auto D_ckv = kv_c_and_k_pe_cache.size(2);
auto B_o = out.size(0);
auto H_o = out.size(1);
auto D_o = out.size(2);

TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
"H_q_nope, H_q_pe, and H_o must be equal to 128");
TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
"PAGE_SIZE must be a power of 2");
TORCH_CHECK(
B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
"Batch dims must be same for page_table, q_nope and q_pe, and out");
TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
"PAGE_NUM must be divisible by 128 / PAGE_SIZE");
TORCH_CHECK(D_o == 512, "D_o must be equal to 512");

TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
q_nope.dtype() == at::ScalarType::BFloat16 ||
q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
"q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
q_nope.dtype() == q_pe.dtype(),
"kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
"seq_lens must be a 32-bit integer tensor");
TORCH_CHECK(page_table.dtype() == torch::kInt32,
"page_table must be a 32-bit integer tensor");

auto in_dtype = q_nope.dtype();
at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
const cudaStream_t stream =
at::cuda::getCurrentCUDAStream(q_nope.get_device());
if (in_dtype == at::ScalarType::Half) {
runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
page_table, scale, stream);
} else if (in_dtype == at::ScalarType::BFloat16) {
runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
seq_lens, page_table, scale, stream);
} else {
TORCH_CHECK(false, "Unsupported input data type of MLA");
}
}
@@ -270,9 +270,10 @@ __global__ void reshape_and_cache_flash_kernel(
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
// head_size]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, const int key_stride, const int value_stride,
const int num_heads, const int head_size, const int block_size,
const float* k_scale, const float* v_scale) {
const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride,
const int64_t value_stride, const int num_heads, const int head_size,
const int block_size, const float* k_scale, const float* v_scale) {
const int64_t token_idx = blockIdx.x;
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded

@@ -288,8 +289,8 @@ __global__ void reshape_and_cache_flash_kernel(
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_key_value_idx = block_idx * block_stride +
block_offset * num_heads * head_size +
head_idx * head_size + head_offset;
block_offset * page_stride +
head_idx * head_stride + head_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {

@@ -396,16 +397,16 @@ void reshape_and_cache(
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(key.data_ptr()), \
reinterpret_cast<KV_T*>(value.data_ptr()), \
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, key_stride, \
value_stride, num_heads, head_size, block_size, \
reinterpret_cast<const float*>(k_scale.data_ptr()), \
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \
vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(key.data_ptr()), \
reinterpret_cast<KV_T*>(value.data_ptr()), \
reinterpret_cast<CACHE_T*>(key_cache.data_ptr()), \
reinterpret_cast<CACHE_T*>(value_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, page_stride, \
head_stride, key_stride, value_stride, num_heads, head_size, \
block_size, reinterpret_cast<const float*>(k_scale.data_ptr()), \
reinterpret_cast<const float*>(v_scale.data_ptr()));

void reshape_and_cache_flash(

@@ -432,9 +433,11 @@ void reshape_and_cache_flash(
int head_size = key.size(2);
int block_size = key_cache.size(1);

int key_stride = key.stride(0);
int value_stride = value.stride(0);
int block_stride = key_cache.stride(0);
int64_t key_stride = key.stride(0);
int64_t value_stride = value.stride(0);
int64_t block_stride = key_cache.stride(0);
int64_t page_stride = key_cache.stride(1);
int64_t head_stride = key_cache.stride(2);
TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));

dim3 grid(num_tokens);

@@ -128,6 +128,12 @@ void advance_step_flashinfer(
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);

void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
torch::Tensor const& seq_lens,
torch::Tensor const& page_table, double scale);

torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);

#ifndef USE_ROCM

@@ -336,7 +336,7 @@ inline void cutlass_gemm_sm89_fp8_dispatch(torch::Tensor& out,

uint32_t const m = a.size(0);
uint32_t const mp2 =
std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2
std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2

if (mp2 <= 16) {
// M in [1, 16]

@@ -321,7 +321,7 @@ inline void cutlass_gemm_sm89_int8_dispatch(torch::Tensor& out,

uint32_t const m = a.size(0);
uint32_t const mp2 =
std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2
std::max(static_cast<uint32_t>(16), next_pow_2(m)); // next power of 2

if (mp2 <= 16) {
// M in [1, 16]

@@ -134,7 +134,7 @@ typename T::Gemm::Arguments args_from_options(
using StrideB = typename T::StrideB;
using StrideD = typename T::StrideD;
using Sm100BlkScaledConfig =
typename T::Gemm::GemmKernel::CollectiveMainloop::Sm100BlkScaledConfig;
typename T::Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;

int m = static_cast<int>(M);
int n = static_cast<int>(N);

@@ -150,7 +150,7 @@ __global__ void LLGemm1_kernel(const scalar_t* in_a, const scalar_t* in_b,
colB_elem4w = bf4[threadid * 4 + 3];

scalar2_t Af2;
scalar2_t Bf2;
[[maybe_unused]] scalar2_t Bf2;
float2 S;

auto Ah2ptr = reinterpret_cast<scalar2_t*>(&rowA_elem4);

@@ -1597,4 +1597,4 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
}
});
});
}
}

@@ -130,6 +130,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
") -> ()");
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);

// Compute MLA decode using cutlass.
ops.def(
"cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
" Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
" Tensor page_table, float scale) -> ()");
ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);

// Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(
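The binding above exposes the new CUTLASS MLA decode kernel as a torch custom op. Below is a hypothetical smoke test: the `torch.ops._C` namespace is an assumption (the diff only shows `TORCH_EXTENSION_NAME`), it requires a build with `ENABLE_CUTLASS_MLA` on a supported GPU, and the tensor shapes simply follow the `TORCH_CHECK` constraints in `cutlass_mla_kernels.cu` above.

```python
# Hypothetical invocation of the registered op; shapes follow the checks above:
# D_latent = 512, D_rope = 64, heads = 128, int32 seq_lens and page_table.
import torch

B, H, D_LATENT, D_ROPE = 2, 128, 512, 64
PAGE_SIZE, PAGES_PER_SEQ = 128, 2

q_nope = torch.randn(B, H, D_LATENT, dtype=torch.bfloat16, device="cuda")
q_pe = torch.randn(B, H, D_ROPE, dtype=torch.bfloat16, device="cuda")
kv_cache = torch.randn(B * PAGES_PER_SEQ, PAGE_SIZE, D_LATENT + D_ROPE,
                       dtype=torch.bfloat16, device="cuda")
seq_lens = torch.full((B,), 64, dtype=torch.int32, device="cuda")
page_table = torch.arange(B * PAGES_PER_SEQ, dtype=torch.int32,
                          device="cuda").reshape(B, PAGES_PER_SEQ)
out = torch.empty(B, H, D_LATENT, dtype=torch.bfloat16, device="cuda")

torch.ops._C.cutlass_mla_decode(out, q_nope, q_pe, kv_cache, seq_lens,
                                page_table, 1.0 / D_LATENT ** 0.5)
```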
@@ -162,6 +162,9 @@ ENV UV_HTTP_TIMEOUT=500
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt
#################### DEV IMAGE ####################

@@ -265,6 +268,9 @@ ADD . /vllm-workspace/
ENV UV_HTTP_TIMEOUT=500

# install development dependencies (for testing)
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt

@@ -291,6 +297,7 @@ RUN mv vllm test_docs/
#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
docs/source/assets/deployment/anything-llm-chat-with-doc.png (new binary file, 118 KiB, not shown)
docs/source/assets/deployment/anything-llm-chat-without-doc.png (new binary file, 136 KiB, not shown)
docs/source/assets/deployment/anything-llm-provider.png (new binary file, 110 KiB, not shown)
docs/source/assets/deployment/anything-llm-upload-doc.png (new binary file, 111 KiB, not shown)
@@ -177,6 +177,11 @@ def linkcode_resolve(domain, info):
for part in info['fullname'].split('.'):
obj = getattr(obj, part)

# Skip decorator wrappers by checking if the object is a function
# and has a __wrapped__ attribute (which decorators typically set)
while hasattr(obj, '__wrapped__'):
obj = obj.__wrapped__

if not (inspect.isclass(obj) or inspect.isfunction(obj)
or inspect.ismethod(obj)):
obj = obj.__class__ # Get the class of the instance
docs/source/deployment/frameworks/anything-llm.md (new file, 47 lines)
@@ -0,0 +1,47 @@
(deployment-anything-llm)=

# Anything LLM

[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.

It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.

## Prerequisites

- Setup vLLM environment

## Deploy

- Start the vLLM server with the supported chat completion model, e.g.

```console
vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
```

- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).

- Open Settings (bottom left), then AI Providers --> LLM:
  - LLM Provider: Generic OpenAI
  - Base URL: http://{vllm server host}:{vllm server port}/v1
  - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`

:::{image} /assets/deployment/anything-llm-provider.png
:::

- Back on the home page, New Workspace --> create a `vllm` workspace, and start to chat:

:::{image} /assets/deployment/anything-llm-chat-without-doc.png
:::

- Click the upload button:
  - upload the doc
  - select the doc and move it to the workspace
  - save and embed

:::{image} /assets/deployment/anything-llm-upload-doc.png
:::

- Chat again:

:::{image} /assets/deployment/anything-llm-chat-with-doc.png
:::
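Once the server from the deployment steps above is running, a quick way to confirm the OpenAI-compatible endpoint before pointing Anything LLM at it is a short client call. The host, port, and API key below are placeholders, not values from the diff.

```python
# Smoke-test the vLLM OpenAI-compatible server started above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="Qwen/Qwen1.5-32B-Chat-AWQ",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(resp.choices[0].message.content)
```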
@@ -3,6 +3,7 @@
:::{toctree}
:maxdepth: 1

anything-llm
bentoml
cerebrium
dstack
@@ -21,11 +21,11 @@ Disaggregated prefill DOES NOT improve throughput.

## Usage example

Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.

## Benchmarks

Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks.
Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.

## Development
@@ -6,13 +6,13 @@ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
The main benefits are lower latency and memory usage.

You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq).

```console
pip install autoawq
```

After installing AutoAWQ, you are ready to quantize a model. Please refer to the `AutoAWQ documentation <https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization>`_ for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:

```python
from awq import AutoAWQForCausalLM
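The ```python block in the hunk above is cut off after the import, so the following is a minimal end-to-end sketch of the usual AutoAWQ flow. The exact keyword names follow AutoAWQ's published examples and may differ between releases; treat them as assumptions rather than part of the diff.

```python
# Minimal AutoAWQ quantization sketch for the model named above.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = "Mistral-7B-Instruct-v0.2-AWQ"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

model.quantize(tokenizer, quant_config=quant_config)  # runs calibration
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```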
@@ -20,8 +20,8 @@ vLLM reads the model's config file and supports pre-quantized checkpoints.

You can find pre-quantized models on:

- [Hugging Face (BitBLAS)](https://huggingface.co/models?other=bitblas)
- [Hugging Face (GPTQ)](https://huggingface.co/models?other=gptq)
- [Hugging Face (BitBLAS)](https://huggingface.co/models?search=bitblas)
- [Hugging Face (GPTQ)](https://huggingface.co/models?search=gptq)

Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section.
@@ -14,7 +14,7 @@ pip install bitsandbytes>=0.45.3

vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.

You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>.
You can find bitsandbytes quantized models on <https://huggingface.co/models?search=bitsandbytes>.
And usually, these repositories have a config.json file that includes a quantization_config section.

## Read quantized checkpoint
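A short sketch of loading a pre-quantized bitsandbytes checkpoint with vLLM is shown below. The model name is one example of a community checkpoint found via the search link above, and the exact flags may vary between vLLM versions; treat them as assumptions.

```python
# Load a pre-quantized bitsandbytes checkpoint with vLLM.
from vllm import LLM

llm = LLM(model="unsloth/tinyllama-bnb-4bit",
          quantization="bitsandbytes",
          load_format="bitsandbytes")
print(llm.generate("The capital of France is")[0].outputs[0].text)
```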
@@ -16,12 +16,16 @@ GPTQModel is one of the few quantization toolkits in the world that allows `Dyna
is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override)
for more details on this and other advanced features.

You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq).
## Installation

You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).

```console
pip install -U gptqmodel --no-build-isolation -v
```

## Quantizing a model

After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details.

Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:

@@ -49,12 +53,16 @@ model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path)
```

## Running a quantized model with vLLM

To run a GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:

```console
python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
```

## Using GPTQModel with vLLM's Python API

GPTQModel quantized models are also supported directly through the LLM entrypoint:

```python

@@ -67,17 +75,22 @@ prompts = [
"The capital of France is",
"The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)

# Create an LLM.
llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
print("-"*50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50)
```
@@ -30,5 +30,4 @@ tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```

Alternatively, you can use the TorchAO Quantization space for quantizing models with a simple UI.
See: https://huggingface.co/spaces/medmekk/TorchAO_Quantization
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
@ -2,8 +2,11 @@
|
||||
|
||||
# Structured Outputs
|
||||
|
||||
vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding.
|
||||
This document shows you some examples of the different options that are available to generate structured outputs.
|
||||
vLLM supports the generation of structured outputs using
|
||||
[xgrammar](https://github.com/mlc-ai/xgrammar) or
|
||||
[guidance](https://github.com/guidance-ai/llguidance) as backends.
|
||||
This document shows you some examples of the different options that are
|
||||
available to generate structured outputs.
|
||||
|
||||
## Online Serving (OpenAI API)
|
||||
|
||||
@ -15,10 +18,17 @@ The following parameters are supported, which must be added as extra parameters:
|
||||
- `guided_regex`: the output will follow the regex pattern.
|
||||
- `guided_json`: the output will follow the JSON schema.
|
||||
- `guided_grammar`: the output will follow the context free grammar.
|
||||
- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding.
|
||||
- `guided_decoding_backend`: used to select the guided decoding backend to use. Additional backend-specific options can be supplied in a comma separated list following a colon after the backend name. For example `"xgrammar:no-fallback"` will not allow vLLM to fallback to a different backend on error.
|
||||
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
|
||||
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page.
|
||||
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page.
|
||||
|
||||
Structured outputs are supported by default in the OpenAI-Compatible Server. You
|
||||
may choose to specify the backend to use by setting the
|
||||
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
|
||||
which will try to choose an appropriate backend based on the details of the
|
||||
request. You may also choose a specific backend, along with
|
||||
some options. A full set of options is available in the `vllm serve --help`
|
||||
text.
|
||||
|
||||
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
|
||||
|
||||
@ -50,7 +60,7 @@ completion = client.chat.completions.create(
|
||||
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
|
||||
}
|
||||
],
|
||||
extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
```
|
||||
@ -96,26 +106,29 @@ print(completion.choices[0].message.content)
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them.
|
||||
This can improve the results notably in most cases.
|
||||
While not strictly necessary, normally it's better to indicate in the prompt the
|
||||
JSON schema and how the fields should be populated. This can improve the
|
||||
results notably in most cases.
|
||||
:::
|
||||
|
||||
Finally we have the `guided_grammar`, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries.
|
||||
It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below:
|
||||
Finally we have the `guided_grammar` option, which is probably the most
|
||||
difficult to use, but it's really powerful. It allows us to define complete
|
||||
languages like SQL queries. It works by using a context-free EBNF grammar.
|
||||
As an example, we can use it to define a specific format of simplified SQL queries:
|
||||
|
||||
```python
|
||||
simplified_sql_grammar = """
|
||||
?start: select_statement
|
||||
root ::= select_statement
|
||||
|
||||
?select_statement: "SELECT " column_list " FROM " table_name
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
|
||||
?column_list: column_name ("," column_name)*
|
||||
column ::= "col_1 " | "col_2 "
|
||||
|
||||
?table_name: identifier
|
||||
table ::= "table_1 " | "table_2 "
|
||||
|
||||
?column_name: identifier
|
||||
condition ::= column "= " number
|
||||
|
||||
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
@ -226,6 +239,8 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
|
||||
Answer: x = -29/8
|
||||
```
|
||||
|
||||
An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py>
|
||||
|
||||
## Offline Inference
|
||||
|
||||
Offline inference allows for the same types of guided decoding.
|
||||
@ -236,11 +251,11 @@ The main available options inside `GuidedDecodingParams` are:
|
||||
- `regex`
|
||||
- `choice`
|
||||
- `grammar`
|
||||
- `backend`
|
||||
- `whitespace_pattern`
|
||||
- `structural_tag`
|
||||
|
||||
These parameters can be used in the same way as the parameters from the Online Serving examples above.
|
||||
One example for the usage of the `choices` parameter is shown below:
|
||||
These parameters can be used in the same way as the parameters from the Online
|
||||
Serving examples above. One example for the usage of the `choice` parameter is
|
||||
shown below:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
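
# The following lines are a hedged sketch of using the `choice` option with
# `GuidedDecodingParams`; the model name is an assumption, not from the original.
from vllm.sampling_params import GuidedDecodingParams

llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_model_len=100)

guided = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided)
outputs = llm.generate(
    prompts="Classify this sentiment: vLLM is wonderful!",
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)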
|
||||
|
||||
@ -152,10 +152,11 @@ Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_cha
|
||||
|
||||
Supported models:
|
||||
|
||||
All Llama 3.1 and 3.2 models should be supported.
|
||||
All Llama 3.1, 3.2 and 4 models should be supported.
|
||||
|
||||
* `meta-llama/Llama-3.1-*`
|
||||
* `meta-llama/Llama-3.2-*`
|
||||
* `meta-llama/Llama-4-*`
|
||||
|
||||
The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below.
|
||||
|
||||
@ -176,6 +177,12 @@ images.
|
||||
|
||||
Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`
|
||||
|
||||
vLLM also provides a JSON-based chat template for Llama 4:
|
||||
* `examples/tool_chat_template_llama4_json.jinja` - this is based on the "official" chat template for the Llama 4
|
||||
models, but tweaked so that it works better with vLLM.
|
||||
|
||||
For Llama 4, use `--tool-call-parser llama4_json --chat-template examples/tool_chat_template_llama4_json.jinja`.
|
||||
|
||||
#### IBM Granite
|
||||
|
||||
Supported models:
|
||||
|
||||
@ -44,7 +44,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu
|
||||
|
||||
You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
|
||||
or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
|
||||
API. This section shows how to create TPUs using the queued resource API. For
|
||||
API (preferred). This section shows how to create TPUs using the queued resource API. For
|
||||
more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api).
|
||||
Queued resources enable you to request Cloud TPU resources in a queued manner.
|
||||
When you request queued resources, the request is added to a queue maintained by
|
||||
@ -97,10 +97,10 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
|
||||
`TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
|
||||
- * ACCELERATOR_TYPE
|
||||
* The TPU version you want to use. Specify the TPU version, for example
|
||||
`v5litepod-4` specifies a v5e TPU with 4 cores. For more information,
|
||||
see `TPU versions <https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions>`_.
|
||||
`v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information,
|
||||
see [TPU versions](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#versions).
|
||||
- * RUNTIME_VERSION
|
||||
* The TPU VM runtime version to use. For more information see `TPU VM images <https://cloud.google.com/tpu/docs/runtimes>`_.
|
||||
* The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes).
|
||||
- * SERVICE_ACCOUNT
|
||||
* The email address for your service account. You can find it in the IAM
|
||||
Cloud Console under *Service Accounts*. For example:
|
||||
|
||||
@ -153,7 +153,7 @@ git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
python use_existing_torch.py
|
||||
pip install -r requirements/build.txt
|
||||
pip install -e . --no-build-isolation
|
||||
pip install --no-build-isolation -e .
|
||||
```
|
||||
|
||||
##### Use the local cutlass for compilation
|
||||
|
||||
@ -44,8 +44,8 @@ This living user guide outlines a few known **important changes and limitations*
|
||||
|-----------------|-----------------------------------------------------------------------------------|
|
||||
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
|
||||
| **LoRA** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
|
||||
| **LoRA** | <nobr>🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))</nobr>|
|
||||
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
||||
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))</nobr>|
|
||||
| **Spec Decode** | <nobr>🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))</nobr>|
|
||||
@ -121,11 +121,6 @@ Although we have re-implemented and partially optimized many features and models
|
||||
These features are already supported in vLLM V1, but their optimization is still
|
||||
in progress.
|
||||
|
||||
- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
|
||||
inferior to that of V0. The team is actively working on improving its
|
||||
performance
|
||||
(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
|
||||
|
||||
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
|
||||
will be follow-up work to support other types of spec decode (e.g., see [PR #13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode.
|
||||
|
||||
|
||||
@ -51,3 +51,29 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer -
|
||||
:::{note}
|
||||
For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md).
|
||||
:::
|
||||
|
||||
## Sharded Model Loading
|
||||
|
||||
vLLM also supports loading sharded models using Run:ai Model Streamer. This is particularly useful for large models that are split across multiple files. To use this feature, use the `--load-format runai_streamer_sharded` flag:
|
||||
|
||||
```console
|
||||
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded
|
||||
```
|
||||
|
||||
The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`:
|
||||
|
||||
```console
|
||||
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
|
||||
```
|
||||
|
||||
To create sharded model files, you can use the script provided in <gh-file:examples/offline_inference/save_sharded_state.py>. This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
|
||||
|
||||
The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
|
||||
|
||||
```console
|
||||
vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'
|
||||
```
|
||||
|
||||
:::{note}
|
||||
The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint.
|
||||
:::
|
||||
|
||||
@ -59,7 +59,7 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/basi
|
||||
|
||||
### `LLM.beam_search`
|
||||
|
||||
The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search-decoding) on top of {class}`~vllm.LLM.generate`.
|
||||
The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of {class}`~vllm.LLM.generate`.
|
||||
For example, to search using 5 beams and output at most 50 tokens:
|
||||
|
||||
```python
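# A minimal sketch of beam search with 5 beams and at most 50 output tokens,
# assuming the BeamSearchParams API; the model name is an assumption.
from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="facebook/opt-125m")
params = BeamSearchParams(beam_width=5, max_tokens=50)
outputs = llm.beam_search([{"prompt": "Hello, my name is "}], params)
print(outputs[0].sequences[0].text)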
|
||||
|
||||
@ -159,14 +159,14 @@ For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model
|
||||
|
||||
### Manually enable Matryoshka Embeddings
|
||||
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`.
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, the output can be changed to arbitrary dimensions. Setting `matryoshka_dimensions` controls the allowed output dimensions.
|
||||
|
||||
For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online).
|
||||
For models that support Matryoshka Embeddings but are not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf_overrides '{"is_matryoshka": true}'` or `--hf_overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
|
||||
|
||||
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
||||
|
||||
```text
|
||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}'
|
||||
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}'
|
||||
```
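
The same override can be applied for offline inference. A minimal sketch, assuming the Snowflake model above and the `hf_overrides` argument mentioned earlier:

```python
from vllm import LLM

# Sketch only: mirror the --hf_overrides flag shown above as a Python dict.
llm = LLM(
    model="Snowflake/snowflake-arctic-embed-m-v1.5",
    task="embed",
    hf_overrides={"matryoshka_dimensions": [256]},
)
outputs = llm.embed(["Follow the white rabbit."])
print(len(outputs[0].outputs.embedding))
```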
|
||||
|
||||
### Offline Inference
|
||||
@ -204,14 +204,14 @@ curl http://127.0.0.1:8000/v1/embeddings \
|
||||
"input": "Follow the white rabbit.",
|
||||
"model": "jinaai/jina-embeddings-v3",
|
||||
"encoding_format": "float",
|
||||
"dimensions": 1
|
||||
"dimensions": 32
|
||||
}'
|
||||
```
|
||||
|
||||
Expected output:
|
||||
|
||||
```json
|
||||
{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
```
|
||||
|
||||
An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
|
||||
|
||||
@ -40,33 +40,37 @@ You can force the use of `TransformersForCausalLM` by setting `model_impl="trans
|
||||
vLLM may not fully optimise the Transformers implementation, so you may see degraded performance when comparing a native model to a Transformers model in vLLM.
|
||||
:::
|
||||
|
||||
#### Supported features
|
||||
#### Custom models
|
||||
|
||||
The Transformers modeling backend explicitly supports the following features:
|
||||
If a model is supported natively by neither vLLM nor Transformers, it can still be used in vLLM!
|
||||
|
||||
- <project:#quantization-index> (except GGUF)
|
||||
- <project:#lora-adapter>
|
||||
- <project:#distributed-serving>
|
||||
For a model to be compatible with the Transformers backend for vLLM, it must:
|
||||
|
||||
#### Remote Code
|
||||
- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
|
||||
* The model directory must have the correct structure (e.g. `config.json` is present).
|
||||
* `config.json` must contain `auto_map.AutoModel`.
|
||||
- be a Transformers backend for vLLM compatible model (see <project:#writing-custom-models>):
|
||||
* Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
|
||||
|
||||
If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM!
|
||||
If the compatible model is:
|
||||
|
||||
Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
|
||||
Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!
|
||||
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for <project:#offline-inference> or `--trust-remote-code` for the <project:#openai-compatible-server>.
|
||||
- in a local directory, simply pass the directory path to `model=<MODEL_DIR>` for <project:#offline-inference> or `vllm serve <MODEL_DIR>` for the <project:#openai-compatible-server>.
|
||||
|
||||
:::{tip}
|
||||
If you have not yet created your custom model, you can follow this guide on [customising models in Transformers](https://huggingface.co/docs/transformers/en/custom_models).
|
||||
:::
|
||||
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model
|
||||
llm.apply_model(lambda model: print(model.__class__))
|
||||
```
|
||||
(writing-custom-models)=
|
||||
|
||||
#### Writing custom models
|
||||
|
||||
This section details the modifications needed to make a Transformers compatible custom model work with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created; see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models).)
|
||||
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
|
||||
1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
|
||||
2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
|
||||
3. `MyModel` must contain `_supports_attention_backend = True`.
|
||||
|
||||
```{code-block} python
|
||||
:caption: modeling_my_model.py
|
||||
|
||||
@ -75,7 +79,7 @@ from torch import nn
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
|
||||
def forward(self, hidden_states, **kwargs): # <- kwargs are required
|
||||
def forward(self, hidden_states, **kwargs):
|
||||
...
|
||||
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
||||
attn_output, attn_weights = attention_interface(
|
||||
@ -91,11 +95,11 @@ class MyModel(PreTrainedModel):
|
||||
_supports_attention_backend = True
|
||||
```
|
||||
|
||||
Here is what happens in the background:
|
||||
Here is what happens in the background when this model is loaded:
|
||||
|
||||
1. The config is loaded
|
||||
2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
|
||||
3. The `TransformersForCausalLM` backend is used. See <gh-file:vllm/model_executor/models/transformers.py>, which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`.
|
||||
1. The config is loaded.
|
||||
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
|
||||
3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
|
||||
|
||||
That's it!
|
||||
|
||||
@ -217,6 +221,16 @@ output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
(feature-status-legend)=
|
||||
|
||||
## Feature Status Legend
|
||||
|
||||
- ✅︎ indicates that the feature is supported for the model.
|
||||
|
||||
- 🚧 indicates that the feature is planned but not yet supported for the model.
|
||||
|
||||
- ⚠️ indicates that the feature is available but may have known issues or limitations.
|
||||
|
||||
(supported-text-models)=
|
||||
|
||||
## List of Text-only Language Models
|
||||
@ -318,7 +332,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* ✅︎
|
||||
- * `GemmaForCausalLM`
|
||||
* Gemma
|
||||
* `google/gemma-2b`, `google/gemma-7b`, etc.
|
||||
* `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Gemma2ForCausalLM`
|
||||
@ -779,6 +793,8 @@ or `--limit-mm-per-prompt` (online serving). For example, to enable passing up t
|
||||
Offline inference:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2-VL-7B-Instruct",
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
@ -879,6 +895,13 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `GraniteSpeechForConditionalGeneration`
|
||||
* Granite Speech
|
||||
* T + A
|
||||
* `ibm-granite/granite-speech-3.3-8b`
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `H2OVLChatModel`
|
||||
* H2OVL
|
||||
* T + I<sup>E+</sup>
|
||||
@ -1078,7 +1101,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
|
||||
:::{important}
|
||||
Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
|
||||
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
|
||||
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": true}'`.
|
||||
:::
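
For offline inference, the same processor option can be passed as a Python dict. A minimal sketch (the model name is an assumption):

```python
from vllm import LLM

# Sketch only: mirror the --mm-processor-kwargs flag shown above.
llm = LLM(
    model="google/gemma-3-4b-it",
    mm_processor_kwargs={"do_pan_and_scan": True},
)
```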
|
||||
|
||||
:::{warning}
|
||||
@ -1093,7 +1116,7 @@ V0 correctly implements the model's attention pattern:
|
||||
|
||||
V1 currently uses a simplified attention pattern:
|
||||
- Uses causal attention for all tokens, including image tokens
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
|
||||
- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
|
||||
- Will be updated in the future to support the correct behavior
|
||||
|
||||
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
|
||||
@ -1107,6 +1130,36 @@ This limitation exists because the model's mixed attention pattern (bidirectiona
|
||||
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
|
||||
:::
|
||||
|
||||
:::{warning}
|
||||
The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.
|
||||
|
||||
For the best results, we recommend using the following dependency versions (tested on A10 and L40):
|
||||
|
||||
```text
|
||||
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
|
||||
torch==2.5.1
|
||||
torchvision==0.20.1
|
||||
transformers==4.48.1
|
||||
tokenizers==0.21.0
|
||||
tiktoken==0.7.0
|
||||
vllm==0.7.0
|
||||
|
||||
# Optional but recommended for improved performance and stability
|
||||
triton==3.1.0
|
||||
xformers==0.0.28.post3
|
||||
uvloop==0.21.0
|
||||
protobuf==5.29.3
|
||||
openai==1.60.2
|
||||
opencv-python-headless==4.11.0.86
|
||||
pillow==10.4.0
|
||||
|
||||
# Installed FlashAttention (for float16 only)
|
||||
flash-attn>=2.5.6 # Not used in float32, but should be documented
|
||||
```
|
||||
|
||||
**Note:** Make sure you understand the security implications of using outdated packages.
|
||||
:::
|
||||
|
||||
:::{note}
|
||||
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
|
||||
For more details, please see: <gh-pr:4087#issuecomment-2250397630>
|
||||
@ -1121,7 +1174,7 @@ To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from
|
||||
`pip install git+https://github.com/huggingface/transformers.git`.
|
||||
|
||||
Reading audio from video during pre-processing is currently supported on V0 (but not V1), because overlapping modalities are not yet supported in V1. You can enable it by passing
|
||||
`--mm-processor-kwargs '{"use_audio_in_video": True}'`.
|
||||
`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
|
||||
:::
|
||||
|
||||
### Pooling Models
|
||||
|
||||
@ -16,6 +16,7 @@ Below, you can find an explanation of every engine argument:
|
||||
:func: _engine_args_parser
|
||||
:prog: vllm serve
|
||||
:nodefaultconst:
|
||||
:markdownhelp:
|
||||
```
|
||||
|
||||
## Async Engine Arguments
|
||||
@ -29,4 +30,5 @@ Additional arguments are available to the asynchronous engine which is used for
|
||||
:func: _async_engine_args_parser
|
||||
:prog: vllm serve
|
||||
:nodefaultconst:
|
||||
:markdownhelp:
|
||||
```
|
||||
|
||||
56
examples/lmcache/README.md
Normal file
@ -0,0 +1,56 @@
|
||||
# LMCache Examples
|
||||
|
||||
This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading and KV cache sharing.
|
||||
|
||||
## 1. Disaggregated Prefill in vLLM v1
|
||||
|
||||
This example demonstrates how to run LMCache with disaggregated prefill using NIXL on a single node.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Install [LMCache](https://github.com/LMCache/LMCache). You can simply run `pip install lmcache`.
|
||||
- Install [NIXL](https://github.com/ai-dynamo/nixl).
|
||||
- At least 2 GPUs
|
||||
- Valid Hugging Face token (HF_TOKEN) for Llama 3.1 8B Instruct.
|
||||
|
||||
### Usage
|
||||
|
||||
Run
|
||||
`cd disagg_prefill_lmcache_v1`
|
||||
to get into the `disagg_prefill_lmcache_v1` folder, and then run
|
||||
|
||||
```bash
|
||||
bash disagg_example_nixl.sh
|
||||
```
|
||||
|
||||
to run disaggregated prefill and benchmark the performance.
|
||||
|
||||
### Components
|
||||
|
||||
#### Server Scripts
|
||||
- `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches individual vLLM servers for prefill/decode, and also launches the proxy server.
|
||||
- `disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between prefiller and decoder
|
||||
- `disagg_prefill_lmcache_v1/disagg_example_nixl.sh` - Main script to run the example
|
||||
|
||||
#### Configuration
|
||||
- `disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml` - Configuration for prefiller server
|
||||
- `disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml` - Configuration for decoder server
|
||||
|
||||
#### Log Files
|
||||
The main script generates several log files:
|
||||
- `prefiller.log` - Logs from the prefill server
|
||||
- `decoder.log` - Logs from the decode server
|
||||
- `proxy.log` - Logs from the proxy server
|
||||
|
||||
## 2. CPU Offload Examples
|
||||
|
||||
- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
|
||||
- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1
|
||||
|
||||
## 3. KV Cache Sharing
|
||||
|
||||
The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances.
|
||||
|
||||
## 4. Disaggregated Prefill in vLLM v0
|
||||
|
||||
The `disaggregated_prefill_lmcache_v0.py` provides an example of how to run disaggregated prefill in vLLM v0.
|
||||
57
examples/lmcache/cpu_offload_lmcache_v1.py
Normal file
@ -0,0 +1,57 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the example usage of cpu offloading
|
||||
with LMCache in vLLM v1.
|
||||
|
||||
Note that lmcache needs to be installed to run this example.
|
||||
Learn more about LMCache in https://github.com/LMCache/LMCache.
|
||||
"""
|
||||
import os
|
||||
|
||||
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
|
||||
from lmcache.integration.vllm.utils import ENGINE_NAME
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
# LMCache-related environment variables
|
||||
# Use experimental features in LMCache
|
||||
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
|
||||
# LMCache is set to use 256 tokens per chunk
|
||||
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
|
||||
# Enable local CPU backend in LMCache
|
||||
os.environ["LMCACHE_LOCAL_CPU"] = "True"
|
||||
# Set local CPU memory limit to 5.0 GB
|
||||
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
|
||||
|
||||
# This example script runs two requests with a shared prefix.
|
||||
shared_prompt = "Hello, how are you?" * 1000
|
||||
first_prompt = [
|
||||
shared_prompt + "Hello, my name is",
|
||||
]
|
||||
second_prompt = [
|
||||
shared_prompt + "Tell me a very long story",
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# memory. Reduce the value if your GPU has less memory.
|
||||
# Note that LMCache is not compatible with chunked prefill for now.
|
||||
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8)
|
||||
|
||||
# Should be able to see logs like the following:
|
||||
# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
|
||||
# This indicates that the KV cache has been stored in LMCache.
|
||||
outputs = llm.generate(first_prompt, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
@ -0,0 +1,13 @@
|
||||
local_cpu: False
|
||||
max_local_cpu_size: 0
|
||||
#local_disk:
|
||||
max_local_disk_size: 0
|
||||
remote_serde: NULL
|
||||
|
||||
enable_nixl: True
|
||||
nixl_role: "receiver"
|
||||
nixl_peer_host: "localhost"
|
||||
nixl_peer_port: 55555
|
||||
nixl_buffer_size: 1073741824 # 1GB
|
||||
nixl_buffer_device: "cuda"
|
||||
nixl_enable_gc: True
|
||||
@ -0,0 +1,13 @@
|
||||
local_cpu: False
|
||||
max_local_cpu_size: 0
|
||||
#local_disk:
|
||||
max_local_disk_size: 0
|
||||
remote_serde: NULL
|
||||
|
||||
enable_nixl: True
|
||||
nixl_role: "sender"
|
||||
nixl_peer_host: "localhost"
|
||||
nixl_peer_port: 55555
|
||||
nixl_buffer_size: 1073741824 # 1GB
|
||||
nixl_buffer_device: "cuda"
|
||||
nixl_enable_gc: True
|
||||
@ -0,0 +1,136 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."
|
||||
|
||||
|
||||
PIDS=()
|
||||
|
||||
# Switch to the directory of the current script
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")"
|
||||
|
||||
check_hf_token() {
|
||||
if [ -z "$HF_TOKEN" ]; then
|
||||
echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
|
||||
exit 1
|
||||
fi
|
||||
if [[ "$HF_TOKEN" != hf_* ]]; then
|
||||
echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
|
||||
exit 1
|
||||
fi
|
||||
echo "HF_TOKEN is set and valid."
|
||||
}
|
||||
|
||||
check_num_gpus() {
|
||||
# Check that the number of available GPUs is >= 2 via nvidia-smi.
|
||||
num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
|
||||
if [ "$num_gpus" -lt 2 ]; then
|
||||
echo "You need at least 2 GPUs to run disaggregated prefill."
|
||||
exit 1
|
||||
else
|
||||
echo "Found $num_gpus GPUs."
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_python_library_installed() {
|
||||
echo "Checking if $1 is installed..."
|
||||
python -c "import $1" > /dev/null 2>&1
|
||||
if [ $? -ne 0 ]; then
|
||||
if [ "$1" == "nixl" ]; then
|
||||
echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation."
|
||||
else
|
||||
echo "$1 is not installed. Please install it via pip install $1."
|
||||
fi
|
||||
exit 1
|
||||
else
|
||||
echo "$1 is installed."
|
||||
fi
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
echo "Stopping everything…"
|
||||
trap - INT TERM # prevent re-entrancy
|
||||
kill -- -$$ # negative PID == "this whole process-group"
|
||||
wait # reap children so we don't leave zombies
|
||||
exit 0
|
||||
}
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
local timeout_seconds=1200
|
||||
local start_time=$(date +%s)
|
||||
|
||||
echo "Waiting for server on port $port..."
|
||||
|
||||
while true; do
|
||||
if curl -s "localhost:${port}/v1/completions" > /dev/null; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
local now=$(date +%s)
|
||||
if (( now - start_time >= timeout_seconds )); then
|
||||
echo "Timeout waiting for server"
|
||||
return 1
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
}
|
||||
|
||||
|
||||
main() {
|
||||
check_hf_token
|
||||
check_num_gpus
|
||||
ensure_python_library_installed lmcache
|
||||
ensure_python_library_installed nixl
|
||||
ensure_python_library_installed pandas
|
||||
ensure_python_library_installed datasets
|
||||
ensure_python_library_installed vllm
|
||||
|
||||
trap cleanup INT
|
||||
trap cleanup USR1
|
||||
trap cleanup TERM
|
||||
|
||||
echo "Launching prefiller, decoder and proxy..."
|
||||
echo "Please check prefiller.log, decoder.log and proxy.log for logs."
|
||||
|
||||
bash disagg_vllm_launcher.sh prefiller \
|
||||
> >(tee prefiller.log) 2>&1 &
|
||||
prefiller_pid=$!
|
||||
PIDS+=($prefiller_pid)
|
||||
|
||||
bash disagg_vllm_launcher.sh decoder \
|
||||
> >(tee decoder.log) 2>&1 &
|
||||
decoder_pid=$!
|
||||
PIDS+=($decoder_pid)
|
||||
|
||||
python3 disagg_proxy_server.py \
|
||||
--host localhost \
|
||||
--port 9000 \
|
||||
--prefiller-host localhost \
|
||||
--prefiller-port 8100 \
|
||||
--decoder-host localhost \
|
||||
--decoder-port 8200 \
|
||||
> >(tee proxy.log) 2>&1 &
|
||||
proxy_pid=$!
|
||||
PIDS+=($proxy_pid)
|
||||
|
||||
wait_for_server 8100
|
||||
wait_for_server 8200
|
||||
wait_for_server 9000
|
||||
|
||||
echo "All servers are up. Starting benchmark..."
|
||||
|
||||
# begin benchmark
|
||||
cd ../../../benchmarks/
|
||||
python benchmark_serving.py --port 9000 --seed $(date +%s) \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--dataset-name random --random-input-len 7500 --random-output-len 200 \
|
||||
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
|
||||
|
||||
echo "Benchmarking done. Cleaning up..."
|
||||
|
||||
cleanup
|
||||
|
||||
}
|
||||
|
||||
main
|
||||
@ -0,0 +1,193 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import httpx
|
||||
import numpy as np
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
Lifespan context manager to handle startup and shutdown events.
|
||||
"""
|
||||
# Startup: Initialize clients
|
||||
prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1'
|
||||
decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1'
|
||||
|
||||
app.state.prefill_client = httpx.AsyncClient(timeout=None,
|
||||
base_url=prefiller_base_url)
|
||||
app.state.decode_client = httpx.AsyncClient(timeout=None,
|
||||
base_url=decoder_base_url)
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown: Close clients
|
||||
await app.state.prefill_client.aclose()
|
||||
await app.state.decode_client.aclose()
|
||||
|
||||
|
||||
# Update FastAPI app initialization to use lifespan
|
||||
app = FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
class StatsCalculator:
|
||||
|
||||
def __init__(self):
|
||||
self._stats = []
|
||||
self._last_log_time = time.time()
|
||||
|
||||
def add(self, value):
|
||||
self._stats.append(value)
|
||||
if time.time() - self._last_log_time > 5:
|
||||
self._log_stats()
|
||||
self._last_log_time = time.time()
|
||||
|
||||
def _log_stats(self):
|
||||
# Print average, median, and 99th percentile
|
||||
np_arr = np.array(self._stats)
|
||||
output_str = f"\nNum requests: {len(self._stats)}" + \
|
||||
"\nPrefill node TTFT stats:" + \
|
||||
f"\n - Average (ms): {np.mean(np_arr)}" + \
|
||||
f"\n - Median (ms): {np.median(np_arr)}" + \
|
||||
f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
|
||||
print("===============================", output_str,
|
||||
"===============================")
|
||||
|
||||
|
||||
stats_calculator = StatsCalculator()
|
||||
counter = 0
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--prefiller-host", type=str, default="localhost")
|
||||
parser.add_argument("--prefiller-port", type=int, default=8100)
|
||||
parser.add_argument("--decoder-host", type=str, default="localhost")
|
||||
parser.add_argument("--decoder-port", type=int, default=8200)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
# Initialize variables to hold the persistent clients
|
||||
app.state.prefill_client = None
|
||||
app.state.decode_client = None
|
||||
|
||||
|
||||
async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
|
||||
req_data: dict):
|
||||
"""
|
||||
Send a request to a service using a persistent client.
|
||||
"""
|
||||
req_data = req_data.copy()
|
||||
req_data['max_tokens'] = 1
|
||||
if 'max_completion_tokens' in req_data:
|
||||
req_data['max_completion_tokens'] = 1
|
||||
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
response = await client.post(endpoint, json=req_data, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response
|
||||
|
||||
|
||||
async def stream_service_response(client: httpx.AsyncClient, endpoint: str,
|
||||
req_data: dict):
|
||||
"""
|
||||
Asynchronously stream the response from a service using a persistent client.
|
||||
"""
|
||||
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||
async with client.stream("POST", endpoint, json=req_data,
|
||||
headers=headers) as response:
|
||||
response.raise_for_status()
|
||||
async for chunk in response.aiter_bytes():
|
||||
yield chunk
|
||||
|
||||
|
||||
@app.post("/v1/completions")
|
||||
async def handle_completions(request: Request):
|
||||
global counter, stats_calculator
|
||||
counter += 1
|
||||
|
||||
st = time.time()
|
||||
try:
|
||||
req_data = await request.json()
|
||||
|
||||
# Send request to prefill service, ignore the response
|
||||
await send_request_to_service(app.state.prefill_client, "/completions",
|
||||
req_data)
|
||||
|
||||
et = time.time()
|
||||
stats_calculator.add(et - st)
|
||||
|
||||
# Stream response from decode service
|
||||
async def generate_stream():
|
||||
async for chunk in stream_service_response(app.state.decode_client,
|
||||
"/completions",
|
||||
req_data):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(generate_stream(),
|
||||
media_type="application/json")
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
import traceback
|
||||
exc_info = sys.exc_info()
|
||||
print("Error occurred in disagg prefill proxy server"
|
||||
" - completions endpoint")
|
||||
print(e)
|
||||
print("".join(traceback.format_exception(*exc_info)))
|
||||
raise
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
|
||||
async def handle_chat_completions(request: Request):
|
||||
global counter, stats_calculator
|
||||
counter += 1
|
||||
|
||||
st = time.time()
|
||||
try:
|
||||
req_data = await request.json()
|
||||
|
||||
# Send request to prefill service, ignore the response
|
||||
await send_request_to_service(app.state.prefill_client,
|
||||
"/chat/completions", req_data)
|
||||
|
||||
et = time.time()
|
||||
stats_calculator.add(et - st)
|
||||
|
||||
# Stream response from decode service
|
||||
async def generate_stream():
|
||||
async for chunk in stream_service_response(app.state.decode_client,
|
||||
"/chat/completions",
|
||||
req_data):
|
||||
yield chunk
|
||||
|
||||
return StreamingResponse(generate_stream(),
|
||||
media_type="application/json")
|
||||
|
||||
except Exception as e:
|
||||
import sys
|
||||
import traceback
|
||||
exc_info = sys.exc_info()
|
||||
print("Error occurred in disagg prefill proxy server "
|
||||
" - chat completions endpoint")
|
||||
print(e)
|
||||
print("".join(traceback.format_exception(*exc_info)))
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
global global_args
|
||||
global_args = parse_args()
|
||||
|
||||
import uvicorn
|
||||
uvicorn.run(app, host=global_args.host, port=global_args.port)
|
||||
@ -0,0 +1,59 @@
|
||||
#!/bin/bash
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
if [[ $# -lt 1 ]]; then
|
||||
echo "Usage: $0 <prefiller | decoder> [model]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ $# -eq 1 ]]; then
|
||||
echo "Using default model: meta-llama/Llama-3.1-8B-Instruct"
|
||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||
else
|
||||
echo "Using model: $2"
|
||||
MODEL=$2
|
||||
fi
|
||||
|
||||
|
||||
if [[ $1 == "prefiller" ]]; then
|
||||
# Prefiller listens on port 8100
|
||||
prefill_config_file=$SCRIPT_DIR/configs/lmcache-prefiller-config.yaml
|
||||
|
||||
UCX_TLS=cuda_ipc,cuda_copy,tcp \
|
||||
LMCACHE_CONFIG_FILE=$prefill_config_file \
|
||||
LMCACHE_USE_EXPERIMENTAL=True \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
CUDA_VISIBLE_DEVICES=0 \
|
||||
vllm serve $MODEL \
|
||||
--port 8100 \
|
||||
--disable-log-requests \
|
||||
--enforce-eager \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_producer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "producer1"}}'
|
||||
|
||||
|
||||
elif [[ $1 == "decoder" ]]; then
|
||||
# Decoder listens on port 8200
|
||||
decode_config_file=$SCRIPT_DIR/configs/lmcache-decoder-config.yaml
|
||||
|
||||
UCX_TLS=cuda_ipc,cuda_copy,tcp \
|
||||
LMCACHE_CONFIG_FILE=$decode_config_file \
|
||||
LMCACHE_USE_EXPERIMENTAL=True \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
CUDA_VISIBLE_DEVICES=1 \
|
||||
vllm serve $MODEL \
|
||||
--port 8200 \
|
||||
--disable-log-requests \
|
||||
--enforce-eager \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_consumer","kv_connector_extra_config": {"discard_partial_chunks": false, "lmcache_rpc_port": "consumer1"}}'
|
||||
|
||||
|
||||
else
|
||||
echo "Invalid role: $1"
|
||||
echo "Should be either prefill, decode"
|
||||
exit 1
|
||||
fi
|
||||
130
examples/lmcache/kv_cache_sharing_lmcache_v1.py
Normal file
@ -0,0 +1,130 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the example usage of remote KV cache sharing
|
||||
with LMCache.
|
||||
We will launch 2 vllm instances, and launch an additional LMCache server.
|
||||
KV cache is transferred in the following manner:
|
||||
(1) vLLM instance 1 -> LMCache server (KV cache store).
|
||||
(2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve).
|
||||
|
||||
Note that lmcache needs to be installed to run this example.
|
||||
Learn more about LMCache in https://github.com/LMCache/LMCache.
|
||||
"""
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from multiprocessing import Event, Process
|
||||
|
||||
from lmcache.experimental.cache_engine import LMCacheEngineBuilder
|
||||
from lmcache.integration.vllm.utils import ENGINE_NAME
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import KVTransferConfig
|
||||
|
||||
# LMCache-related environment variables
|
||||
# The port to start LMCache server
|
||||
port = 8100
|
||||
# Use experimental features in LMCache
|
||||
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
|
||||
# LMCache is set to use 256 tokens per chunk
|
||||
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
|
||||
# Disable local CPU backend in LMCache
|
||||
os.environ["LMCACHE_LOCAL_CPU"] = "False"
|
||||
# Set local CPU memory buffer limit to 5.0 GB
|
||||
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
|
||||
# Set the remote URL for LMCache server
|
||||
os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"
|
||||
# Set the serializer/deserializer between vllm and LMCache server
|
||||
# `naive` indicates using raw bytes of the tensor without any compression
|
||||
os.environ["LMCACHE_REMOTE_SERDE"] = "naive"
|
||||
|
||||
prompts = [
|
||||
"Hello, how are you?" * 1000,
|
||||
]
|
||||
|
||||
|
||||
def run_store(store_done, prompts):
|
||||
# We use GPU 0 for KV cache store process.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# memory. Reduce the value if your GPU has less memory.
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
print("KV cache store is finished.")
|
||||
store_done.set()
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
|
||||
|
||||
def run_retrieve(store_done, prompts, timeout=1):
|
||||
# We use GPU 1 for KV cache retrieve process.
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
|
||||
|
||||
ktc = KVTransferConfig.from_cli(
|
||||
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
|
||||
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
|
||||
# of memory. Reduce the value if your GPU has less memory.
|
||||
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
kv_transfer_config=ktc,
|
||||
max_model_len=8000,
|
||||
gpu_memory_utilization=0.8,
|
||||
enforce_eager=True)
|
||||
|
||||
print("Waiting for KV cache store to finish...")
|
||||
store_done.wait()
|
||||
time.sleep(timeout)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Generated text: {generated_text!r}")
|
||||
|
||||
# Clean up lmcache backend
|
||||
LMCacheEngineBuilder.destroy(ENGINE_NAME)
|
||||
|
||||
|
||||
def run_lmcache_server(port):
|
||||
server_proc = subprocess.Popen([
|
||||
"python", "-m", "lmcache.experimental.server", "localhost",
|
||||
str(port)
|
||||
])
|
||||
return server_proc
|
||||
|
||||
|
||||
def main():
|
||||
store_done = Event()
|
||||
store_process = Process(target=run_store, args=(store_done, prompts))
|
||||
retrieve_process = Process(target=run_retrieve, args=(store_done, prompts))
|
||||
lmcache_server_process = run_lmcache_server(port)
|
||||
|
||||
# Start KV cache store process
|
||||
store_process.start()
|
||||
|
||||
# Start KV cache retrieve process
|
||||
retrieve_process.start()
|
||||
|
||||
# Clean up the processes
|
||||
store_process.join()
|
||||
retrieve_process.terminate()
|
||||
lmcache_server_process.terminate()
|
||||
lmcache_server_process.wait()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -38,6 +38,37 @@ class ModelRequestData(NamedTuple):
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Granite Speech
|
||||
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
|
||||
# NOTE - the settings in this example are somewhat different from what is
|
||||
# optimal for granite speech, and it is generally recommended to use beam
|
||||
# search. Check the model README for suggested settings.
|
||||
# https://huggingface.co/ibm-granite/granite-speech-3.3-8b
|
||||
model_name = "ibm-granite/granite-speech-3.3-8b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
# The model has an audio-specific lora directly in its model dir;
|
||||
# it should be enabled whenever you pass audio inputs to the model.
|
||||
speech_lora_path = model_name
|
||||
audio_placeholder = "<|audio|>" * audio_count
|
||||
prompts = f"<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>{audio_placeholder}{question}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompts,
|
||||
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
|
||||
)
|
||||
|
||||
|
||||
# MiniCPM-O
|
||||
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
|
||||
model_name = "openbmb/MiniCPM-o-2_6"
|
||||
@ -209,6 +240,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"granite_speech": run_granite_speech,
|
||||
"minicpmo": run_minicpmo,
|
||||
"phi4_mm": run_phi4mm,
|
||||
"qwen2_audio": run_qwen2_audio,
|
||||
|
||||
@ -10,12 +10,12 @@ prompts = [
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=10)
|
||||
|
||||
|
||||
def main():
|
||||
# Create an LLM.
|
||||
llm = LLM(model="facebook/opt-125m")
|
||||
llm = LLM(model="facebook/opt-125m", disable_cascade_attn=True)
|
||||
# Generate texts from the prompts.
|
||||
# The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
|
||||
@ -52,8 +52,8 @@ def main():
|
||||
|
||||
args = parse_args()
|
||||
|
||||
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
|
||||
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
|
||||
eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
|
||||
|
||||
max_model_len = 2048
|
||||
|
||||
@ -81,7 +81,7 @@ def main():
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config={
|
||||
"method": "eagle",
|
||||
"method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle",
|
||||
"model": eagle_dir,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
"draft_tensor_parallel_size": args.draft_tp,
|
||||
@ -95,6 +95,9 @@ def main():
|
||||
outputs = llm.generate(prompt_token_ids=prompt_ids,
|
||||
sampling_params=sampling_params)
|
||||
|
||||
if not hasattr(outputs, "metrics") or outputs.metrics is None:
|
||||
return
|
||||
|
||||
# calculate the average number of accepted tokens per forward pass, +1 is
|
||||
# to account for the token from the target model that's always going to be
|
||||
# accepted
|
||||
@ -109,6 +112,11 @@ def main():
|
||||
{sum(acceptance_counts) / acceptance_counts[0]:.2f}")
|
||||
print("-" * 50)
|
||||
|
||||
# print acceptance at each token position
|
||||
for i in range(len(acceptance_counts)):
|
||||
print(f"acceptance at token {i}:"
|
||||
f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -791,7 +791,9 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--num-images",
|
||||
"-n",
|
||||
choices=list(range(1, 13)), # 12 is the max number of images
|
||||
type=int,
|
||||
choices=list(range(1,
|
||||
len(IMAGE_URLS) + 1)), # the max number of images
|
||||
default=2,
|
||||
help="Number of images to use for the demo.")
|
||||
return parser.parse_args()
|
||||
|
||||
@ -1,43 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
To run this example, you need to start the vLLM server:
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2.5-3B-Instruct
|
||||
```
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from openai import BadRequestError, OpenAI
|
||||
from pydantic import BaseModel
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="-",
|
||||
)
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
completion = client.chat.completions.create(
|
||||
model="Qwen/Qwen2.5-3B-Instruct",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Classify this sentiment: vLLM is wonderful!"
|
||||
}],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
def guided_choice_completion(client: OpenAI, model: str):
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "Classify this sentiment: vLLM is wonderful!"
|
||||
}],
|
||||
extra_body={"guided_choice": ["positive", "negative"]},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Guided decoding by Regex
|
||||
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
def guided_regex_completion(client: OpenAI, model: str):
|
||||
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="Qwen/Qwen2.5-3B-Instruct",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": "\w+@\w+\.com\n",
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
print(completion.choices[0].message.content)
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": r"\w+@\w+\.com\n",
|
||||
"stop": ["\n"]
|
||||
},
|
||||
)
|
||||
return completion.choices[0].message.content
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
@ -54,66 +60,100 @@ class CarDescription(BaseModel):
    car_type: CarType


json_schema = CarDescription.model_json_schema()
def guided_json_completion(client: OpenAI, model: str):
    json_schema = CarDescription.model_json_schema()

prompt = ("Generate a JSON with the brand, model and car_type of"
          "the most iconic car from the 90's")
completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{
        "role": "user",
        "content": prompt,
    }],
    extra_body={"guided_json": json_schema},
)
print(completion.choices[0].message.content)

# Guided decoding by Grammar
simplified_sql_grammar = """
    ?start: select_statement

    ?select_statement: "SELECT " column_list " FROM " table_name

    ?column_list: column_name ("," column_name)*

    ?table_name: identifier

    ?column_name: identifier

    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""

prompt = ("Generate an SQL query to show the 'username' and 'email'"
          "from the 'users' table.")
completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[{
        "role": "user",
        "content": prompt,
    }],
    extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)

# Extra backend options
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
          "End in .com and new line. Example result:"
          "alan.turing@enigma.com\n")

try:
    # The no-fallback option forces vLLM to use xgrammar, so when it fails
    # you get a 400 with the reason why
    prompt = ("Generate a JSON with the brand, model and car_type of"
              "the most iconic car from the 90's")
    completion = client.chat.completions.create(
        model="Qwen/Qwen2.5-3B-Instruct",
        model=model,
        messages=[{
            "role": "user",
            "content": prompt,
        }],
        extra_body={
            "guided_regex": "\w+@\w+\.com\n",
            "stop": ["\n"],
            "guided_decoding_backend": "xgrammar:no-fallback"
        },
        extra_body={"guided_json": json_schema},
    )
except BadRequestError as e:
    print("This error is expected:", e)
    return completion.choices[0].message.content


# Guided decoding by Grammar
def guided_grammar_completion(client: OpenAI, model: str):
    simplified_sql_grammar = """
        root ::= select_statement

        select_statement ::= "SELECT " column " from " table " where " condition

        column ::= "col_1 " | "col_2 "

        table ::= "table_1 " | "table_2 "

        condition ::= column "= " number

        number ::= "1 " | "2 "
    """

    prompt = ("Generate an SQL query to show the 'username' and 'email'"
              "from the 'users' table.")
    completion = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": prompt,
        }],
        extra_body={"guided_grammar": simplified_sql_grammar},
    )
    return completion.choices[0].message.content


# Extra backend options
def extra_backend_options_completion(client: OpenAI, model: str):
    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
              "End in .com and new line. Example result:"
              "alan.turing@enigma.com\n")

    try:
        # The no-fallback option forces vLLM to use xgrammar, so when it fails
        # you get a 400 with the reason why
        completion = client.chat.completions.create(
            model=model,
            messages=[{
                "role": "user",
                "content": prompt,
            }],
            extra_body={
                "guided_regex": r"\w+@\w+\.com\n",
                "stop": ["\n"],
                "guided_decoding_backend": "xgrammar:no-fallback"
            },
        )
        return completion.choices[0].message.content
    except BadRequestError as e:
        print("This error is expected:", e)


def main():
    client: OpenAI = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="-",
    )

    model = "Qwen/Qwen2.5-3B-Instruct"

    print("Guided Choice Completion:")
    print(guided_choice_completion(client, model))

    print("\nGuided Regex Completion:")
    print(guided_regex_completion(client, model))

    print("\nGuided JSON Completion:")
    print(guided_json_completion(client, model))

    print("\nGuided Grammar Completion:")
    print(guided_grammar_completion(client, model))

    print("\nExtra Backend Options Completion:")
    print(extra_backend_options_completion(client, model))


if __name__ == "__main__":
    main()

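All of the client helpers above assume an OpenAI-compatible vLLM server already listening on localhost:8000 and serving the model named in `main()`. As a small, purely illustrative preflight that is not part of the diff, the served models can be listed before running the demos:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="-")
try:
    # Listing models is a cheap way to confirm the server is reachable and
    # to discover the served model name instead of hard-coding it.
    served = [m.id for m in client.models.list().data]
    print("Server reachable; serving:", served)
except Exception as exc:
    raise SystemExit(f"vLLM server not reachable on localhost:8000: {exc}")
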
@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI

# This example demonstrates the `structural_tag` response format.
# It can be used to specify a structured output format that occurs between
# specific tags in the response. This example shows how it could be used
# to enforce the format of a tool call response, but it could be used for
# any structured output within a subset of the response.


def main():
    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="-",
    )

    messages = [{
        "role":
        "user",
        "content":
        """
You have access to the following function to retrieve the weather in a city:

{
    "name": "get_weather",
    "parameters": {
        "city": {
            "param_type": "string",
            "description": "The city to get the weather for",
            "required": True
        }
    }
}

If you choose to call a function, ONLY reply in the following format:
<{start_tag}={function_name}>{parameters}{end_tag}
where

start_tag => `<function`
parameters => a JSON dict with the function argument name as key and function
argument value as value.
end_tag => `</function>`

Here is an example,
<function=example_function_name>{"example_name": "example_value"}</function>

Reminder:
- Function calls MUST follow the specified format
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query

You are a helpful assistant.

Given the previous instructions, what is the weather in New York City, Boston,
and San Francisco?
"""
    }]

    response = client.chat.completions.create(
        model="meta-llama/Llama-3.1-8B-Instruct",
        messages=messages,
        response_format={
            "type":
            "structural_tag",
            "structures": [{
                "begin": "<function=get_weather>",
                "schema": {
                    "type": "object",
                    "properties": {
                        "city": {
                            "type": "string"
                        }
                    }
                },
                "end": "</function>"
            }],
            "triggers": ["<function="]
        })
    print(response)


if __name__ == "__main__":
    main()

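The response content is plain text containing zero or more tagged calls; between the `begin`/`end` tags the body is constrained to the declared JSON schema. A minimal client-side parse is sketched below. It is not part of the example above, and the helper name and regex are illustrative assumptions:

import json
import re

def extract_tool_calls(text: str):
    """Pull (function_name, arguments) pairs out of <function=...>...</function> spans."""
    calls = []
    for name, body in re.findall(r"<function=(\w+)>(.*?)</function>", text, re.DOTALL):
        # The structural_tag schema constrains the body to a JSON object,
        # so json.loads is expected to succeed here.
        calls.append((name, json.loads(body)))
    return calls

# e.g. extract_tool_calls(response.choices[0].message.content)
# -> [("get_weather", {"city": "New York City"}), ...]
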
@ -25,29 +25,28 @@ from pydantic import BaseModel
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
client = OpenAI(
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
models = client.models.list()
|
||||
model = models.data[0].id
|
||||
def print_completion_details(completion):
|
||||
print("reasoning_content: ",
|
||||
completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
|
||||
|
||||
# Guided decoding by Regex
|
||||
prompt = ("What is the capital of France?")
|
||||
def guided_regex_completion(client: OpenAI, model: str):
|
||||
prompt = ("What is the capital of France?")
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": "(Paris|London)",
|
||||
},
|
||||
)
|
||||
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={
|
||||
"guided_regex": "(Paris|London)",
|
||||
},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
|
||||
class People(BaseModel):
|
||||
@ -55,19 +54,19 @@ class People(BaseModel):
|
||||
age: int
|
||||
|
||||
|
||||
json_schema = People.model_json_schema()
|
||||
def guided_json_completion(client: OpenAI, model: str):
|
||||
json_schema = People.model_json_schema()
|
||||
|
||||
prompt = ("Generate a JSON with the name and age of one random person.")
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_json": json_schema},
|
||||
)
|
||||
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
prompt = ("Generate a JSON with the name and age of one random person.")
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_json": json_schema},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
@ -84,46 +83,73 @@ class CarDescription(BaseModel):
|
||||
car_type: CarType
|
||||
|
||||
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
def guided_car_json_completion(client: OpenAI, model: str):
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
|
||||
prompt = ("Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's")
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_json": json_schema},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
prompt = ("Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's")
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_json": json_schema},
|
||||
)
|
||||
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
|
||||
# Guided decoding by Grammar
|
||||
simplified_sql_grammar = """
|
||||
?start: select_statement
|
||||
def guided_grammar_completion(client: OpenAI, model: str):
|
||||
simplified_sql_grammar = """
|
||||
root ::= select_statement
|
||||
|
||||
?select_statement: "SELECT " column_list " FROM " table_name
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
|
||||
?column_list: column_name ("," column_name)*
|
||||
column ::= "col_1 " | "col_2 "
|
||||
|
||||
?table_name: identifier
|
||||
table ::= "table_1 " | "table_2 "
|
||||
|
||||
?column_name: identifier
|
||||
condition ::= column "= " number
|
||||
|
||||
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
|
||||
"""
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
|
||||
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
|
||||
prompt = ("Generate an SQL query to show the 'username' and 'email'"
|
||||
"from the 'users' table.")
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
)
|
||||
print("reasoning_content: ", completion.choices[0].message.reasoning_content)
|
||||
print("content: ", completion.choices[0].message.content)
|
||||
# This may be very slow https://github.com/vllm-project/vllm/issues/12122
|
||||
prompt = ("Generate an SQL query to show the 'username' and 'email'"
|
||||
"from the 'users' table.")
|
||||
completion = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}],
|
||||
extra_body={"guided_grammar": simplified_sql_grammar},
|
||||
)
|
||||
print_completion_details(completion)
|
||||
|
||||
|
||||
def main():
|
||||
client: OpenAI = OpenAI(
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
models = client.models.list()
|
||||
model: str = models.data[0].id
|
||||
|
||||
print("Guided Regex Completion:")
|
||||
guided_regex_completion(client, model)
|
||||
|
||||
print("\nGuided JSON Completion (People):")
|
||||
guided_json_completion(client, model)
|
||||
|
||||
print("\nGuided JSON Completion (CarDescription):")
|
||||
guided_car_json_completion(client, model)
|
||||
|
||||
print("\nGuided Grammar Completion:")
|
||||
guided_grammar_completion(client, model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -25,11 +25,11 @@ def main():
    responses = client.embeddings.create(
        input=["Follow the white rabbit."],
        model=model,
        dimensions=1,
        dimensions=32,
    )

    for data in responses.data:
        print(data.embedding)  # List of float of len 1
        print(data.embedding)  # List of float of len 32


if __name__ == "__main__":
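As a purely illustrative follow-up (not part of the diff), the 32-dimensional vectors returned above are large enough to compare meaningfully, for example with a cosine similarity; the helper below is an assumption layered on the example, not vLLM API:

import math

def cosine(a: list[float], b: list[float]) -> float:
    # Plain-Python cosine similarity between two embedding vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)

# e.g. cosine(responses.data[0].embedding, responses.data[0].embedding) ~= 1.0
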
48 examples/online_serving/ray_serve_deepseek.py Normal file
@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0
"""
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
See Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html

Run `python3 ray_serve_deepseek.py` to deploy the model.
"""

from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config={
        "model_id": "deepseek",
        # Since DeepSeek model is huge, it is recommended to pre-download
        # the model to local disk, say /path/to/the/model and specify:
        # model_source="/path/to/the/model"
        "model_source": "deepseek-ai/DeepSeek-R1",
    },
    deployment_config={
        "autoscaling_config": {
            "min_replicas": 1,
            "max_replicas": 1,
        }
    },
    # Change to the accelerator type of the node
    accelerator_type="H100",
    runtime_env={"env_vars": {
        "VLLM_USE_V1": "1"
    }},
    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
    engine_kwargs={
        "tensor_parallel_size": 8,
        "pipeline_parallel_size": 2,
        "gpu_memory_utilization": 0.92,
        "dtype": "auto",
        "max_num_seqs": 40,
        "max_model_len": 16384,
        "enable_chunked_prefill": True,
        "enable_prefix_caching": True,
        "trust_remote_code": True,
    },
)

# Deploy the application
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)
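Once `serve.run` returns, the app exposes an OpenAI-compatible endpoint. The snippet below is only an illustrative client-side check and is not part of the new file; the port and route assume Ray Serve's default HTTP ingress (http://localhost:8000) and the `model_id` configured above:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
resp = client.chat.completions.create(
    model="deepseek",  # matches model_loading_config["model_id"] above
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(resp.choices[0].message.content)
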
116 examples/tool_chat_template_llama4_json.jinja Normal file
@ -0,0 +1,116 @@
|
||||
{%- macro is_array_of_type_objects(var) -%}
|
||||
{%- if var is iterable and var is not string -%}
|
||||
{%- set valid = true -%}
|
||||
{%- for item in var -%}
|
||||
{%- if 'type' not in item -%}
|
||||
{%- set valid = false -%}
|
||||
{%- break -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{{ valid }}
|
||||
{%- else -%}
|
||||
{{ false }}
|
||||
{%- endif -%}
|
||||
{%- endmacro %}
|
||||
|
||||
{%- macro render_message(message) %}
|
||||
{%- if message['content'] is string %}
|
||||
{{- message['content']|trim }}
|
||||
{%- elif is_array_of_type_objects(message['content']) == 'True' %}
|
||||
{%- for content in message['content'] %}
|
||||
{%- if content['type'] == 'image' %}
|
||||
{{- '<|image|>' }}
|
||||
{%- elif content['type'] == 'text' %}
|
||||
{{- content['text']|trim }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- else %}
|
||||
{{- message['content']|tojson }}
|
||||
{%- endif %}
|
||||
{%- endmacro %}
|
||||
|
||||
{{- bos_token }}
|
||||
{%- if custom_tools is defined %}
|
||||
{%- set tools = custom_tools %}
|
||||
{%- endif %}
|
||||
{%- if not tools_in_user_message is defined %}
|
||||
{%- set tools_in_user_message = true %}
|
||||
{%- endif %}
|
||||
{%- if not tools is defined %}
|
||||
{%- set tools = none %}
|
||||
{%- endif %}
|
||||
|
||||
{#- This block extracts the system message, so we can slot it into the right place. #}
|
||||
{%- if messages[0]['role'] == 'system' %}
|
||||
{%- set system_message = messages[0] %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{%- set system_message = ({ "content": "You are a helpful assistant with tool calling "
|
||||
"capabilities. Only reply with a tool call if the function exists in the "
|
||||
"library provided by the user. If it doesn't exist, just reply directly in "
|
||||
"natural language. When you receive a tool call response, use the output to "
|
||||
"format an answer to the original user question."}) %}
|
||||
{%- endif %}
|
||||
|
||||
{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one '
|
||||
'or more function/tool calls to fulfill the task. \n'
|
||||
'If none are needed, then proceed to the response.\n\n'
|
||||
'Tool Call Syntax: You can call tools using the following syntax:\n'
|
||||
'{"name": function name, "parameters": dictionary of argument name and its value}.\n'
|
||||
'Separate multiple function calls by "; ". Do not use variables.\n'
|
||||
'Do not include anything else when calling the tools with the syntax above.\n\n'
|
||||
'Here is a list of functions in JSON format that you can invoke.\n' %}
|
||||
|
||||
{{- "<|header_start|>system<|header_end|>\n\n" }}
|
||||
{%- if tools is not none and not tools_in_user_message %}
|
||||
{{- tool_lib_preamble }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- render_message(system_message) }}
|
||||
{{ "<|eot|>\n" }}
|
||||
|
||||
{#- Custom tools are passed in a user message with some extra guidance #}
|
||||
{%- if tools_in_user_message and not tools is none %}
|
||||
{#- Extract the first user message so we can plug it in here #}
|
||||
{%- if messages | length != 0 %}
|
||||
{%- set first_user_message = messages[0] %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
|
||||
{%- endif %}
|
||||
{{- '<|header_start|>user<|header_end|>\n\n' }}
|
||||
{{- tool_lib_preamble }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{{- render_message(first_user_message) + "\n<|eot|>"}}
|
||||
{%- endif %}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
|
||||
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
|
||||
{{- render_message(message) }}
|
||||
{{- "\n<|eot|>" }}
|
||||
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
|
||||
{{- '\n<|header_start|>assistant<|header_end|>\n\n' -}}
|
||||
{{- render_message(message) }}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{{- '{"name": "' + tool_call.function.name + '", ' }}
|
||||
{{- '"parameters": ' }}
|
||||
{{- tool_call.function.arguments | tojson }}
|
||||
{{- "}" }}
|
||||
{%- endfor %}
|
||||
{{- "\n<|eot|>" }}
|
||||
{%- elif message.role == "tool" or message.role == "ipython" %}
|
||||
{{- "\n<|header_start|>ipython<|header_end|>\n\n" }}
|
||||
{{- render_message(message) }}
|
||||
{{- "\n<|eom|>" }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '\n<|header_start|>assistant<|header_end|>\n\n' }}
|
||||
{%- endif %}
|
||||
@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main"

[tool.setuptools.packages.find]
where = ["."]
exclude = ["benchmarks", "csrc", "docs", "examples", "tests*"]
namespaces = false
include = ["vllm*"]

[tool.yapfignore]
ignore_patterns = [
@ -59,7 +58,8 @@ ignore_patterns = [
line-length = 80
exclude = [
    # External file, leaving license intact
    "examples/other/fp8/quantizer/quantize.py"
    "examples/other/fp8/quantizer/quantize.py",
    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
]

[tool.ruff.lint.per-file-ignores]
@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2
myst-parser==3.0.1
msgspec
cloudpickle
commonmark # Required by sphinx-argparse when using :markdownhelp:

# packages to install to build the documentation
cachetools
@ -9,4 +9,4 @@ numpy==1.26.4
tabulate
setuptools>=61
setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
@ -34,7 +34,7 @@ num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.51.1
transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
@ -737,7 +737,7 @@ tqdm==4.66.6
    #   transformers
tqdm-multiprocess==0.0.11
    # via lm-eval
transformers==4.51.1
transformers==4.51.3
    # via
    #   -r requirements/test.in
    #   genai-perf
27 setup.py
@ -269,15 +269,17 @@ class cmake_build_ext(build_ext):
        # First, run the standard build_ext command to compile the extensions
        super().run()

        # copy vllm/vllm_flash_attn/*.py from self.build_lib to current
        # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
        # directory so that they can be included in the editable build
        import glob
        files = glob.glob(
            os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py"))
        files = glob.glob(os.path.join(self.build_lib, "vllm",
                                       "vllm_flash_attn", "**", "*.py"),
                          recursive=True)
        for file in files:
            dst_file = os.path.join("vllm/vllm_flash_attn",
                                    os.path.basename(file))
                                    file.split("vllm/vllm_flash_attn/")[-1])
            print(f"Copying {file} to {dst_file}")
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            self.copy_file(file, dst_file)

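For clarity on the behavioral change: with `recursive=True`, the `**` component also matches files in nested package directories, which the old single-level `*.py` pattern missed. A small illustrative check (the directory layout below is made up for the example):

import glob
import os
import tempfile

with tempfile.TemporaryDirectory() as root:
    os.makedirs(os.path.join(root, "vllm", "vllm_flash_attn", "layers"))
    for rel in ("vllm/vllm_flash_attn/__init__.py",
                "vllm/vllm_flash_attn/layers/rotary.py"):
        open(os.path.join(root, rel), "w").close()
    flat = glob.glob(os.path.join(root, "vllm", "vllm_flash_attn", "*.py"))
    deep = glob.glob(os.path.join(root, "vllm", "vllm_flash_attn", "**", "*.py"),
                     recursive=True)
    # Only the recursive pattern sees layers/rotary.py.
    assert len(flat) == 1 and len(deep) == 2
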
@ -377,13 +379,22 @@ class repackage_wheel(build_ext):
            "vllm/_flashmla_C.abi3.so",
            "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
            "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
            "vllm/vllm_flash_attn/flash_attn_interface.py",
            "vllm/vllm_flash_attn/__init__.py",
            "vllm/cumem_allocator.abi3.so",
            # "vllm/_version.py", # not available in nightly wheels yet
        ]
        file_members = filter(lambda x: x.filename in files_to_copy,
                              wheel.filelist)

        file_members = list(
            filter(lambda x: x.filename in files_to_copy, wheel.filelist))

        # vllm_flash_attn python code:
        # Regex from
        # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
        import re
        compiled_regex = re.compile(
            r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
        file_members += list(
            filter(lambda x: compiled_regex.match(x.filename),
                   wheel.filelist))

        for file in file_members:
            print(f"Extracting and including {file.filename} "
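The regex mirrors `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` (the helper exists in Python 3.13; the pattern is presumably inlined so older interpreters can still run setup.py). As a quick illustration of what it selects from the wheel file list, the paths below are examples, not an exhaustive manifest:

import re

compiled_regex = re.compile(
    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
# Top-level and nested Python modules are both picked up now.
assert compiled_regex.match("vllm/vllm_flash_attn/flash_attn_interface.py")
assert compiled_regex.match("vllm/vllm_flash_attn/layers/rotary.py")
# Non-Python artifacts are still handled by the explicit files_to_copy list.
assert not compiled_regex.match("vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so")
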
@ -10,7 +10,7 @@ from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
|
||||
kFp8DynamicTokenSym, kFp8StaticTensorSym)
|
||||
from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.config import CompilationConfig
|
||||
from vllm.config import CompilationConfig, VllmConfig
|
||||
|
||||
from .backend import TestBackend
|
||||
|
||||
@ -49,13 +49,15 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
|
||||
do_fusion: bool):
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
|
||||
enable_noop=True)
|
||||
noop_pass = NoOpEliminationPass(config)
|
||||
fusion_pass = FusionPass.instance(config)
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.compilation_config = CompilationConfig(pass_config= \
|
||||
CompilationConfig.PassConfig(enable_fusion=do_fusion,
|
||||
enable_noop=True))
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
fusion_pass = FusionPass.instance(vllm_config)
|
||||
|
||||
passes = [noop_pass, fusion_pass] if do_fusion else [noop_pass]
|
||||
func_pass = FixFunctionalizationPass(config)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
backend_func = TestBackend(*passes, func_pass)
|
||||
backend_no_func = TestBackend(*passes)
|
||||
|
||||
|
||||
@ -77,12 +77,13 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
|
||||
|
||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm"]))
|
||||
vllm_config.compilation_config.pass_config = \
|
||||
CompilationConfig.PassConfig(enable_fusion=True,
|
||||
enable_noop=True)
|
||||
with vllm.config.set_current_vllm_config(vllm_config):
|
||||
# Reshape pass is needed for the fusion pass to work
|
||||
config = CompilationConfig.PassConfig(enable_fusion=True,
|
||||
enable_noop=True)
|
||||
noop_pass = NoOpEliminationPass(config)
|
||||
fusion_pass = FusionPass.instance(config)
|
||||
noop_pass = NoOpEliminationPass(vllm_config)
|
||||
fusion_pass = FusionPass.instance(vllm_config)
|
||||
|
||||
backend = TestBackend(noop_pass, fusion_pass)
|
||||
model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)
|
||||
|
||||
@ -6,7 +6,7 @@ import torch
|
||||
|
||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
||||
from vllm.compilation.pass_manager import PostGradPassManager
|
||||
from vllm.config import CompilationConfig
|
||||
from vllm.config import VllmConfig
|
||||
|
||||
|
||||
# dummy custom pass that doesn't inherit
|
||||
@ -16,7 +16,7 @@ def simple_callable(graph: torch.fx.Graph):
|
||||
|
||||
# Should fail to add directly to the pass manager
|
||||
def test_bad_callable():
|
||||
config = CompilationConfig().pass_config
|
||||
config = VllmConfig()
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
@ -43,7 +43,7 @@ class ProperPass(InductorPass):
|
||||
],
|
||||
)
|
||||
def test_pass_manager_uuid(callable):
|
||||
config = CompilationConfig().pass_config
|
||||
config = VllmConfig()
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
@ -64,7 +64,8 @@ def test_pass_manager_uuid(callable):
|
||||
|
||||
# UUID should be different due to config change
|
||||
config2 = copy.deepcopy(config)
|
||||
config2.enable_fusion = not config2.enable_fusion
|
||||
config2.compilation_config.pass_config.enable_fusion = not \
|
||||
config2.compilation_config.pass_config.enable_fusion
|
||||
pass_manager3 = PostGradPassManager()
|
||||
pass_manager3.configure(config2)
|
||||
pass_manager3.add(callable)
|
||||
|
||||
190 tests/compile/test_sequence_parallelism.py Normal file
@ -0,0 +1,190 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
|
||||
from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe,
|
||||
find_specified_fn,
|
||||
find_specified_fn_maybe, is_func)
|
||||
from vllm.compilation.sequence_parallelism import SequenceParallelismPass
|
||||
from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
|
||||
VllmConfig)
|
||||
from vllm.distributed import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.parallel_state import (init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
from .backend import TestBackend
|
||||
|
||||
OPS_IN_MODEL_BEFORE = [
|
||||
torch.ops.vllm.all_reduce.default,
|
||||
]
|
||||
|
||||
OPS_IN_MODEL_AFTER = [
|
||||
torch.ops.vllm.reduce_scatter.default,
|
||||
torch.ops.vllm.all_gather.default,
|
||||
]
|
||||
|
||||
OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
|
||||
|
||||
class TestModel(torch.nn.Module):
|
||||
|
||||
def __init__(self, hidden_size=16, intermediate_size=32):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.gate_proj = torch.nn.Parameter(
|
||||
torch.empty((intermediate_size, hidden_size)))
|
||||
self.norm = RMSNorm(hidden_size, 1e-05)
|
||||
# Initialize weights
|
||||
torch.nn.init.normal_(self.gate_proj, std=0.02)
|
||||
|
||||
def forward(self, hidden_states, residual):
|
||||
"""
|
||||
Forward pass implementing the operations in the FX graph
|
||||
|
||||
Args:
|
||||
hidden_states: Input tensor
|
||||
residual: Residual tensor from previous layer
|
||||
|
||||
Returns:
|
||||
Tuple containing the output tensor
|
||||
"""
|
||||
# Reshape input
|
||||
view = hidden_states.reshape(-1, self.hidden_size)
|
||||
|
||||
#matrix multiplication
|
||||
permute = self.gate_proj.permute(1, 0)
|
||||
mm = torch.mm(view, permute)
|
||||
|
||||
# Tensor parallel all-reduce
|
||||
all_reduce = tensor_model_parallel_all_reduce(mm)
|
||||
|
||||
# layer normalization
|
||||
norm_output, residual_output = self.norm(all_reduce, residual)
|
||||
|
||||
return norm_output, residual_output
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("batch_size", [8])
|
||||
@pytest.mark.parametrize("seq_len", [16])
|
||||
@pytest.mark.parametrize("hidden_size", [16])
|
||||
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
|
||||
reason="Only test on CUDA")
|
||||
def test_sequence_parallelism_pass(batch_size: int, seq_len: int,
|
||||
hidden_size: int, dtype: torch.dtype):
|
||||
num_processes = 2
|
||||
|
||||
def run_torch_spawn(fn, nprocs):
|
||||
# need to use torch.mp.spawn otherwise will have problems with
|
||||
# torch.distributed and cuda
|
||||
torch.multiprocessing.spawn(fn,
|
||||
args=(num_processes, batch_size, seq_len,
|
||||
hidden_size, dtype),
|
||||
nprocs=nprocs)
|
||||
|
||||
run_torch_spawn(sequence_parallelism_pass_on_test_model, num_processes)
|
||||
|
||||
|
||||
def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
|
||||
batch_size: int, seq_len: int,
|
||||
hidden_size: int,
|
||||
dtype: torch.dtype):
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
torch.set_default_device(device)
|
||||
torch.set_default_dtype(dtype)
|
||||
|
||||
update_environment_variables({
|
||||
'RANK': str(local_rank),
|
||||
'LOCAL_RANK': str(local_rank),
|
||||
'WORLD_SIZE': str(world_size),
|
||||
'MASTER_ADDR': 'localhost',
|
||||
'MASTER_PORT': '12345',
|
||||
})
|
||||
|
||||
# initialize distributed
|
||||
init_distributed_environment()
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
# configure vllm config for SequenceParallelismPass
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.compilation_config = CompilationConfig(
|
||||
pass_config=CompilationConfig.PassConfig(
|
||||
enable_sequence_parallelism=True, ), )
|
||||
vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
|
||||
|
||||
# this is a fake model name to construct the model config
|
||||
# in the vllm_config, it's not really used.
|
||||
model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
|
||||
vllm_config.model_config = ModelConfig(model=model,
|
||||
task="auto",
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype=dtype,
|
||||
seed=42)
|
||||
|
||||
sequence_parallelism_pass = SequenceParallelismPass(vllm_config)
|
||||
backend_no_func = TestBackend(sequence_parallelism_pass)
|
||||
func_pass = FixFunctionalizationPass(vllm_config)
|
||||
backend_func = TestBackend(sequence_parallelism_pass, func_pass)
|
||||
|
||||
model = TestModel(hidden_size, hidden_size * 2)
|
||||
hidden_states = torch.randn((batch_size * seq_len, hidden_size),
|
||||
dtype=dtype)
|
||||
residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
|
||||
|
||||
compiled_model_no_func = torch.compile(model, backend=backend_no_func)
|
||||
compiled_model_no_func(hidden_states, residual)
|
||||
compiled_model_func = torch.compile(model, backend=backend_func)
|
||||
compiled_model_func(hidden_states, residual)
|
||||
|
||||
# Check substitution worked
|
||||
pre_nodes = backend_no_func.graph_pre_pass.nodes
|
||||
post_nodes = backend_no_func.graph_post_pass.nodes
|
||||
|
||||
# In pre-nodes, all reduce should be there,
|
||||
# reduce scatter and all gather should not
|
||||
for op in OPS_IN_MODEL_BEFORE:
|
||||
find_specified_fn(pre_nodes, op)
|
||||
for op in OPS_IN_MODEL_AFTER:
|
||||
assert find_specified_fn_maybe(pre_nodes, op) is None
|
||||
|
||||
# In post-nodes, reduce scatter and all gather should be there,
|
||||
# all reduce should not
|
||||
for op in OPS_IN_MODEL_AFTER:
|
||||
find_specified_fn(post_nodes, op)
|
||||
for op in OPS_IN_MODEL_BEFORE:
|
||||
assert find_specified_fn_maybe(post_nodes, op) is None
|
||||
|
||||
# check if the functionalization pass is applied
|
||||
for op in OPS_IN_MODEL:
|
||||
find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
|
||||
assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
|
||||
op) is None # noqa: E501
|
||||
|
||||
# make sure the ops were all de-functionalized
|
||||
found = dict()
|
||||
for node in backend_func.graph_post_pass.nodes:
|
||||
for op in OPS_IN_MODEL:
|
||||
if is_func(node, op):
|
||||
found[op] = True
|
||||
assert all(found[op] for op in OPS_IN_MODEL)
|
||||
@ -21,9 +21,10 @@ from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
from tests.models.utils import (TokensTextLogprobs,
|
||||
TokensTextLogprobsPromptLogprobs)
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
|
||||
from vllm.config import TaskOption, _get_and_verify_dtype
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
@ -103,10 +104,25 @@ class _VideoAssets(_VideoAssetsBase):
|
||||
return [prompts["sample_demo_1"]]
|
||||
|
||||
|
||||
class _AudioAssetsBase(UserList[AudioAsset]):
|
||||
pass
|
||||
|
||||
|
||||
class _AudioAssets(_AudioAssetsBase):
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__([
|
||||
AudioAsset("mary_had_lamb"),
|
||||
AudioAsset("winning_call"),
|
||||
])
|
||||
|
||||
|
||||
IMAGE_ASSETS = _ImageAssets()
|
||||
"""Singleton instance of :class:`_ImageAssets`."""
|
||||
VIDEO_ASSETS = _VideoAssets()
|
||||
"""Singleton instance of :class:`_VideoAssets`."""
|
||||
AUDIO_ASSETS = _AudioAssets()
|
||||
"""Singleton instance of :class:`_AudioAssets`."""
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
@ -263,6 +279,11 @@ def video_assets() -> _VideoAssets:
|
||||
return VIDEO_ASSETS
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def audio_assets() -> _AudioAssets:
|
||||
return AUDIO_ASSETS
|
||||
|
||||
|
||||
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
|
||||
_R = TypeVar("_R")
|
||||
|
||||
@ -390,10 +411,15 @@ class HfRunner:
|
||||
processor_kwargs["images"] = image
|
||||
if videos is not None and (video := videos[i]) is not None:
|
||||
processor_kwargs["videos"] = video
|
||||
if audios is not None and (audio_tuple := audios[i]) is not None:
|
||||
audio, sr = audio_tuple
|
||||
processor_kwargs["audio"] = audio
|
||||
processor_kwargs["sampling_rate"] = sr
|
||||
if audios is not None and (audio_inputs := audios[i]) is not None:
|
||||
# HACK - not all processors take sampling_rate; we should
|
||||
# clean this up in the future.
|
||||
if len(audio_inputs) == 2:
|
||||
audio, sr = audio_inputs
|
||||
processor_kwargs["audio"] = audio
|
||||
processor_kwargs["sampling_rate"] = sr
|
||||
else:
|
||||
processor_kwargs["audio"] = audio_inputs
|
||||
|
||||
inputs = self.processor(**processor_kwargs)
|
||||
if isinstance(inputs, BatchFeature):
|
||||
@ -531,7 +557,10 @@ class HfRunner:
|
||||
for _, hidden_state in enumerate(hidden_states):
|
||||
last_hidden_states = hidden_state[-1][0]
|
||||
logits = torch.matmul(
|
||||
last_hidden_states.to(output_embeddings.weight.device),
|
||||
last_hidden_states.to(
|
||||
device=output_embeddings.weight.device,
|
||||
dtype=output_embeddings.weight.dtype,
|
||||
),
|
||||
output_embeddings.weight.t(),
|
||||
)
|
||||
if getattr(output_embeddings, "bias", None) is not None:
|
||||
@ -1010,20 +1039,6 @@ def vllm_runner():
|
||||
return VllmRunner
|
||||
|
||||
|
||||
def get_tokenizer_pool_config(tokenizer_group_type):
|
||||
if tokenizer_group_type is None:
|
||||
return None
|
||||
if tokenizer_group_type == "ray":
|
||||
return TokenizerPoolConfig(pool_size=1,
|
||||
pool_type="ray",
|
||||
extra_config={})
|
||||
if isinstance(tokenizer_group_type, type):
|
||||
return TokenizerPoolConfig(pool_size=1,
|
||||
pool_type=tokenizer_group_type,
|
||||
extra_config={})
|
||||
raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def temporary_enable_log_propagate():
|
||||
import logging
|
||||
|
||||
Some files were not shown because too many files have changed in this diff