Compare commits
115 Commits
whisper-tr
...
fix_use_ep
| Author | SHA1 | Date | |
|---|---|---|---|
| 87e47eb1db | |||
| 090c856d76 | |||
| ad434d4cfe | |||
| 66d433b94f | |||
| 027b204ff1 | |||
| 55dcce91df | |||
| 8017c8db7f | |||
| dc3529dbf6 | |||
| 7699258ef0 | |||
| e9ba99f296 | |||
| 7c80368710 | |||
| 95d63f38c0 | |||
| bb8dab821e | |||
| fc0f87768a | |||
| 0a57386721 | |||
| 3749e28774 | |||
| 86fc2321ff | |||
| 2549c0dfef | |||
| b10e519895 | |||
| 9bde5ba127 | |||
| 72c8f1ad04 | |||
| da224daaa9 | |||
| 3a100b9278 | |||
| 242a637aea | |||
| c2a9671510 | |||
| d5ae4f7f42 | |||
| b6c502a150 | |||
| 9ca710e525 | |||
| eb07c8cb5b | |||
| ba10801961 | |||
| 620fc2d09e | |||
| 29283eaa7e | |||
| 2fa66ef713 | |||
| 13affc432d | |||
| d8f094a92a | |||
| 97ae6d777f | |||
| 6baeee70d1 | |||
| d2517a4939 | |||
| 6342adc438 | |||
| 0adba91547 | |||
| 4285e423a6 | |||
| 63375f0cdb | |||
| 70ad3f9e98 | |||
| d6fc629f4d | |||
| af51d80fa1 | |||
| f5722a5052 | |||
| 651cf0fec1 | |||
| 4dc52e1c53 | |||
| 4708f13a9c | |||
| a6d042df0a | |||
| 40a36ccfeb | |||
| ef608c37a7 | |||
| 2386803f2a | |||
| 95862f7b4d | |||
| 230b131b54 | |||
| 0812d8dd41 | |||
| bf7e3c51ae | |||
| a35a8a8392 | |||
| 4ef0bb1fcf | |||
| fadc59c0e6 | |||
| 86cbd2eee9 | |||
| 092475f738 | |||
| dcc56d62da | |||
| f15e70d906 | |||
| b6be6f8d1e | |||
| 03a70eacaf | |||
| 45b1ff7a25 | |||
| 15ba07ef25 | |||
| d2b58ca203 | |||
| 82e7e19a6e | |||
| 421c462948 | |||
| 84884cd9ac | |||
| a43aa183dc | |||
| 463bbb1835 | |||
| 5e125e74d1 | |||
| 06f21ce7a5 | |||
| 57a810db9c | |||
| 8b664706aa | |||
| 37bfee92bf | |||
| e73ff24e31 | |||
| bd7599d34a | |||
| 01b6113659 | |||
| 1b84eff03a | |||
| 55acf86bf8 | |||
| f021b97993 | |||
| 1cab43c2d2 | |||
| 8bd651b318 | |||
| 58e234a754 | |||
| e86c414d6a | |||
| 550b2801ad | |||
| cefb9e5a28 | |||
| 98d7367b61 | |||
| 594a8b9030 | |||
| 44f990515b | |||
| 252937806c | |||
| 51826d51fa | |||
| 14e53ed11f | |||
| ddb94c2605 | |||
| 90969fb39a | |||
| 101f1481f9 | |||
| 2edc87b161 | |||
| 4203926f10 | |||
| cdb57015a7 | |||
| aa557e6422 | |||
| 0e00d40e4f | |||
| c920e01242 | |||
| 274d8e8818 | |||
| 2039c6305b | |||
| 6efb195a6e | |||
| 24b7fb455a | |||
| 58f5a59769 | |||
| db9dfcfa6a | |||
| 9ef98d527e | |||
| 93491aefc7 | |||
| 7acd539cd7 |
@ -10,15 +10,24 @@ set -x
|
||||
set -o pipefail
|
||||
|
||||
check_gpus() {
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
if command -v nvidia-smi; then
|
||||
# check the number of GPUs and GPU type.
|
||||
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
|
||||
fi
|
||||
|
||||
if [[ $gpu_count -gt 0 ]]; then
|
||||
echo "GPU found."
|
||||
else
|
||||
echo "Need at least 1 GPU to run benchmarking."
|
||||
exit 1
|
||||
fi
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
if command -v nvidia-smi; then
|
||||
declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
|
||||
elif command -v amd-smi; then
|
||||
declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
|
||||
fi
|
||||
echo "GPU type is $gpu_type"
|
||||
}
|
||||
|
||||
@ -90,9 +99,15 @@ kill_gpu_processes() {
|
||||
|
||||
|
||||
# wait until GPU memory usage smaller than 1GB
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
if command -v nvidia-smi; then
|
||||
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
elif command -v amd-smi; then
|
||||
while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
|
||||
sleep 1
|
||||
done
|
||||
fi
|
||||
|
||||
# remove vllm config file
|
||||
rm -rf ~/.config/vllm
|
||||
|
||||
@ -6,7 +6,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -17,7 +17,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -34,7 +34,7 @@ steps:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
|
||||
@ -105,19 +105,33 @@ fi
|
||||
if [[ $commands == *" entrypoints/openai "* ]]; then
|
||||
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
|
||||
--ignore=entrypoints/openai/test_audio.py \
|
||||
--ignore=entrypoints/openai/test_chat.py \
|
||||
--ignore=entrypoints/openai/test_shutdown.py \
|
||||
--ignore=entrypoints/openai/test_completion.py \
|
||||
--ignore=entrypoints/openai/test_sleep.py \
|
||||
--ignore=entrypoints/openai/test_models.py \
|
||||
--ignore=entrypoints/openai/test_lora_adapters.py \
|
||||
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \
|
||||
--ignore=entrypoints/openai/test_root_path.py \
|
||||
--ignore=entrypoints/openai/test_tokenization.py \
|
||||
--ignore=entrypoints/openai/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#ignore certain Entrypoints/llm tests
|
||||
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
if [[ $commands == *" entrypoints/llm "* ]]; then
|
||||
commands=${commands//" entrypoints/llm "/" entrypoints/llm \
|
||||
--ignore=entrypoints/llm/test_chat.py \
|
||||
--ignore=entrypoints/llm/test_accuracy.py \
|
||||
--ignore=entrypoints/llm/test_init.py \
|
||||
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
|
||||
--ignore=entrypoints/llm/test_prompt_validation.py "}
|
||||
fi
|
||||
|
||||
#Obsolete currently
|
||||
##ignore certain Entrypoints/llm tests
|
||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
|
||||
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
|
||||
#fi
|
||||
|
||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
|
||||
# --ignore=entrypoints/openai/test_embedding.py \
|
||||
# --ignore=entrypoints/openai/test_oot_registration.py
|
||||
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
set -xue
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
@ -36,7 +36,11 @@ docker run --privileged --net host --shm-size=16G -it \
|
||||
&& echo TEST_6 \
|
||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
|
||||
&& echo TEST_7 \
|
||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
|
||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
|
||||
&& echo TEST_8 \
|
||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
|
||||
&& echo TEST_9 \
|
||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
|
||||
|
||||
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
@ -5,8 +5,8 @@
|
||||
set -ex
|
||||
set -o pipefail
|
||||
|
||||
# cd into parent directory of this file
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
||||
# cd 2 levels into the working directory
|
||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
|
||||
|
||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
set -euox pipefail
|
||||
|
||||
if [[ $# -lt 4 ]]; then
|
||||
echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@ -104,7 +104,7 @@ steps:
|
||||
- label: Entrypoints Test # 40min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
mirror_hardwares: [amd]
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/entrypoints/llm
|
||||
@ -155,6 +155,7 @@ steps:
|
||||
- popd
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
mirror_hardwares: [amd]
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -173,7 +174,7 @@ steps:
|
||||
##### 1 GPU test #####
|
||||
|
||||
- label: Regression Test # 5min
|
||||
mirror_hardwares: [amd]
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/test_regression
|
||||
@ -204,7 +205,6 @@ steps:
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/engine
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/sample
|
||||
@ -285,11 +285,11 @@ steps:
|
||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||
|
||||
- label: LoRA Test %N # 15min each
|
||||
mirror_hardwares: [amd]
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||
parallelism: 4
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
@ -311,7 +311,7 @@ steps:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
|
||||
- label: Kernels Test %N # 1h each
|
||||
mirror_hardwares: [amd]
|
||||
# mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/attention
|
||||
@ -321,7 +321,7 @@ steps:
|
||||
parallelism: 4
|
||||
|
||||
- label: Tensorizer Test # 11min
|
||||
mirror_hardwares: [amd]
|
||||
# mirror_hardwares: [amd]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
@ -337,7 +337,7 @@ steps:
|
||||
source_file_dependencies:
|
||||
- benchmarks/
|
||||
commands:
|
||||
- bash run-benchmarks.sh
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Quantization Test # 33min
|
||||
source_file_dependencies:
|
||||
@ -372,7 +372,7 @@ steps:
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 20 min
|
||||
fast_check: false
|
||||
mirror_hardwares: [ amd ]
|
||||
#mirror_hardwares: [ amd ]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/tool_use
|
||||
@ -389,7 +389,8 @@ steps:
|
||||
- pytest -v -s models/test_transformers.py
|
||||
- pytest -v -s models/test_registry.py
|
||||
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
|
||||
|
||||
- label: Language Models Test (Standard) # 32min
|
||||
#mirror_hardwares: [amd]
|
||||
@ -464,6 +465,7 @@ steps:
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
mirror_hardwares: [amd]
|
||||
optional: true
|
||||
commands:
|
||||
- echo 'Testing custom models...'
|
||||
@ -475,6 +477,7 @@ steps:
|
||||
##### multi gpus test #####
|
||||
|
||||
- label: Distributed Comm Ops Test # 7min
|
||||
mirror_hardwares: [amd]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -602,8 +605,6 @@ steps:
|
||||
# requires multi-GPU testing for validation.
|
||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_minicpmv_tp.py
|
||||
- pytest -v -s -x lora/test_transfomers_model.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
|
||||
@ -15,14 +15,12 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
|
||||
---
|
||||
|
||||
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
|
||||
|
||||
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||
@ -103,7 +101,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
||||
## Contributing
|
||||
|
||||
We welcome and value any contributions and collaborations.
|
||||
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
|
||||
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
|
||||
|
||||
## Sponsors
|
||||
|
||||
@ -126,6 +124,7 @@ Compute Resources:
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Google Cloud
|
||||
- Intel
|
||||
- Lambda Lab
|
||||
- Nebius
|
||||
- Novita AI
|
||||
|
||||
@ -51,6 +51,12 @@ become available.
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>likaixin/InstructCoder</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-AIMO</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>HuggingFace-Other</strong></td>
|
||||
@ -187,6 +193,35 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
**`AI-MO/aimo-validation-aime`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--num-prompts 10 \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
### Running With Sampling Parameters
|
||||
|
||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
parameters can be specified. Example client command:
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--top-k 10 \
|
||||
--top-p 0.9 \
|
||||
--temperature 0.5 \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
---
|
||||
## Example - Offline Throughput Benchmark
|
||||
|
||||
@ -278,6 +313,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
**`AI-MO/aimo-validation-aime`**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_throughput.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--backend vllm \
|
||||
--dataset-name hf \
|
||||
--dataset-path AI-MO/aimo-validation-aime \
|
||||
--hf-split train \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
### Benchmark with LoRA Adapters
|
||||
|
||||
``` bash
|
||||
|
||||
@ -219,7 +219,15 @@ async def async_request_deepspeed_mii(
|
||||
if response.status == 200:
|
||||
parsed_resp = await response.json()
|
||||
output.latency = time.perf_counter() - st
|
||||
output.generated_text = parsed_resp["text"][0]
|
||||
if "choices" in parsed_resp:
|
||||
output.generated_text = parsed_resp["choices"][0][
|
||||
"text"]
|
||||
elif "text" in parsed_resp:
|
||||
output.generated_text = parsed_resp["text"][0]
|
||||
else:
|
||||
output.error = ("Unexpected response format: "
|
||||
"neither 'choices' nor 'text' found")
|
||||
output.success = False
|
||||
output.success = True
|
||||
else:
|
||||
output.error = response.reason or ""
|
||||
@ -489,3 +497,9 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"scalellm": async_request_openai_completions,
|
||||
"sglang": async_request_openai_completions,
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
k for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v in (async_request_openai_completions,
|
||||
async_request_openai_chat_completions)
|
||||
]
|
||||
|
||||
@ -582,15 +582,6 @@ class HuggingFaceDataset(BenchmarkDataset):
|
||||
) -> None:
|
||||
super().__init__(dataset_path=dataset_path, **kwargs)
|
||||
|
||||
# Validate dataset path
|
||||
if self.SUPPORTED_DATASET_PATHS and \
|
||||
self.dataset_path not in self.SUPPORTED_DATASET_PATHS:
|
||||
raise ValueError(
|
||||
f"{self.__class__.__name__} "
|
||||
f"only supports: {', '.join(self.SUPPORTED_DATASET_PATHS)}. "
|
||||
"Please consider contributing if you would "
|
||||
"like to add support for additional dataset formats.")
|
||||
|
||||
self.dataset_split = dataset_split
|
||||
self.dataset_subset = dataset_subset
|
||||
self.load_data()
|
||||
@ -761,3 +752,52 @@ class InstructCoderDataset(HuggingFaceDataset):
|
||||
))
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
return sampled_requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# AIMO Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class AIMODataset(HuggingFaceDataset):
|
||||
"""
|
||||
Dataset class for processing a AIMO dataset with reasoning questions.
|
||||
"""
|
||||
SUPPORTED_DATASET_PATHS = {
|
||||
"AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
|
||||
"AI-MO/NuminaMath-CoT"
|
||||
}
|
||||
|
||||
def sample(self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
output_len: Optional[int] = None,
|
||||
**kwargs) -> list:
|
||||
sampled_requests = []
|
||||
dynamic_output = output_len is None
|
||||
|
||||
for item in self.data:
|
||||
if len(sampled_requests) >= num_requests:
|
||||
break
|
||||
prompt, completion = item['problem'], item["solution"]
|
||||
|
||||
prompt_ids = tokenizer(prompt).input_ids
|
||||
completion_ids = tokenizer(completion).input_ids
|
||||
prompt_len = len(prompt_ids)
|
||||
completion_len = len(completion_ids)
|
||||
output_len = completion_len if dynamic_output else output_len
|
||||
assert isinstance(output_len, int) and output_len > 0
|
||||
if dynamic_output and not is_valid_sequence(prompt_len,
|
||||
completion_len,
|
||||
max_prompt_len=2048,
|
||||
max_total_len=32000):
|
||||
continue
|
||||
sampled_requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
multi_modal_data=None,
|
||||
))
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
return sampled_requests
|
||||
|
||||
@ -34,7 +34,8 @@ from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS,
|
||||
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
@ -49,7 +50,8 @@ try:
|
||||
except ImportError:
|
||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||
|
||||
from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
|
||||
from benchmark_dataset import (AIMODataset, BurstGPTDataset,
|
||||
ConversationDataset, HuggingFaceDataset,
|
||||
InstructCoderDataset, RandomDataset,
|
||||
SampleRequest, ShareGPTDataset, SonnetDataset,
|
||||
VisionArenaDataset)
|
||||
@ -259,6 +261,7 @@ async def benchmark(
|
||||
goodput_config_dict: dict[str, float],
|
||||
max_concurrency: Optional[int],
|
||||
lora_modules: Optional[Iterable[str]],
|
||||
extra_body: Optional[dict],
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
@ -286,6 +289,7 @@ async def benchmark(
|
||||
logprobs=logprobs,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body,
|
||||
)
|
||||
|
||||
test_output = await request_func(request_func_input=test_input)
|
||||
@ -312,7 +316,8 @@ async def benchmark(
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos)
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body)
|
||||
profile_output = await request_func(request_func_input=profile_input)
|
||||
if profile_output.success:
|
||||
print("Profiler started")
|
||||
@ -362,7 +367,8 @@ async def benchmark(
|
||||
output_len=output_len,
|
||||
logprobs=logprobs,
|
||||
multi_modal_content=mm_content,
|
||||
ignore_eos=ignore_eos)
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body)
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
limited_request_func(request_func_input=request_func_input,
|
||||
@ -595,14 +601,28 @@ def main(args: argparse.Namespace):
|
||||
args.hf_split = "train"
|
||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_class = ConversationDataset
|
||||
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_class = AIMODataset
|
||||
args.hf_split = "train"
|
||||
else:
|
||||
supported_datasets = set([
|
||||
dataset_name for cls in HuggingFaceDataset.__subclasses__()
|
||||
for dataset_name in cls.SUPPORTED_DATASET_PATHS
|
||||
])
|
||||
raise ValueError(
|
||||
f"Unsupported dataset path: {args.dataset_path}. "
|
||||
"Huggingface dataset only supports dataset_path"
|
||||
f" from one of following: {supported_datasets}. "
|
||||
"Please consider contributing if you would "
|
||||
"like to add support for additional dataset formats.")
|
||||
input_requests = dataset_class(
|
||||
dataset_path=args.dataset_path,
|
||||
dataset_subset=args.hf_subset,
|
||||
dataset_split=args.hf_split,
|
||||
random_seed=args.seed,
|
||||
).sample(
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
random_seed=args.seed,
|
||||
output_len=args.hf_output_len,
|
||||
)
|
||||
|
||||
@ -637,6 +657,26 @@ def main(args: argparse.Namespace):
|
||||
raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
|
||||
goodput_config_dict = check_goodput_args(args)
|
||||
|
||||
# Collect the sampling parameters.
|
||||
sampling_params = {
|
||||
k: v
|
||||
for k, v in {
|
||||
"top_p": args.top_p,
|
||||
"top_k": args.top_k,
|
||||
"min_p": args.min_p,
|
||||
"temperature": args.temperature
|
||||
}.items() if v is not None
|
||||
}
|
||||
|
||||
# Sampling parameters are only supported by openai-compatible backend.
|
||||
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
|
||||
raise ValueError(
|
||||
"Sampling parameters are only supported by openai-compatible "
|
||||
"backends.")
|
||||
|
||||
if "temperature" not in sampling_params:
|
||||
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
|
||||
|
||||
# Avoid GC processing "static" data - reduce pause times.
|
||||
gc.collect()
|
||||
gc.freeze()
|
||||
@ -663,6 +703,7 @@ def main(args: argparse.Namespace):
|
||||
goodput_config_dict=goodput_config_dict,
|
||||
max_concurrency=args.max_concurrency,
|
||||
lora_modules=args.lora_modules,
|
||||
extra_body=sampling_params,
|
||||
))
|
||||
|
||||
# Save config and results to json
|
||||
@ -985,6 +1026,33 @@ if __name__ == "__main__":
|
||||
"from the sampled HF dataset.",
|
||||
)
|
||||
|
||||
sampling_group = parser.add_argument_group("sampling parameters")
|
||||
sampling_group.add_argument(
|
||||
"--top-p",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Top-p sampling parameter. Only has effect on openai-compatible "
|
||||
"backends.")
|
||||
sampling_group.add_argument(
|
||||
"--top-k",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Top-k sampling parameter. Only has effect on openai-compatible "
|
||||
"backends.")
|
||||
sampling_group.add_argument(
|
||||
"--min-p",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Min-p sampling parameter. Only has effect on openai-compatible "
|
||||
"backends.")
|
||||
sampling_group.add_argument(
|
||||
"--temperature",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Temperature sampling parameter. Only has effect on "
|
||||
"openai-compatible backends. If not specified, default to greedy "
|
||||
"decoding (i.e. temperature==0.0).")
|
||||
|
||||
parser.add_argument(
|
||||
'--tokenizer-mode',
|
||||
type=str,
|
||||
|
||||
@ -11,10 +11,10 @@ from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
import uvloop
|
||||
from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
|
||||
InstructCoderDataset, RandomDataset,
|
||||
SampleRequest, ShareGPTDataset, SonnetDataset,
|
||||
VisionArenaDataset)
|
||||
from benchmark_dataset import (AIMODataset, BurstGPTDataset,
|
||||
ConversationDataset, InstructCoderDataset,
|
||||
RandomDataset, SampleRequest, ShareGPTDataset,
|
||||
SonnetDataset, VisionArenaDataset)
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
@ -332,7 +332,10 @@ def get_requests(args, tokenizer):
|
||||
common_kwargs['dataset_subset'] = args.hf_subset
|
||||
common_kwargs['dataset_split'] = args.hf_split
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
|
||||
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = AIMODataset
|
||||
common_kwargs['dataset_subset'] = None
|
||||
common_kwargs['dataset_split'] = "train"
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
|
||||
# Remove None values
|
||||
@ -467,12 +470,13 @@ def validate_args(args):
|
||||
since --dataset-name is not 'hf'.",
|
||||
stacklevel=2)
|
||||
elif args.dataset_name == "hf":
|
||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend." #noqa: E501
|
||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend." #noqa: E501
|
||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert args.backend == "vllm-chat", "ConversationDataset needs to use vllm-chat as the backend." #noqa: E501
|
||||
if args.dataset_path in (
|
||||
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
|
||||
| ConversationDataset.SUPPORTED_DATASET_PATHS):
|
||||
assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501
|
||||
elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
|
||||
| AIMODataset.SUPPORTED_DATASET_PATHS):
|
||||
assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{args.dataset_path} is not supported by hf dataset.")
|
||||
|
||||
@ -553,6 +553,9 @@ def main(args: argparse.Namespace):
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
else:
|
||||
if not hasattr(config, "hidden_size"):
|
||||
# Support for llama4
|
||||
config = config.text_config
|
||||
# Default: Mixtral.
|
||||
E = config.num_local_experts
|
||||
topk = config.num_experts_per_tok
|
||||
|
||||
@ -33,8 +33,6 @@ endif()
|
||||
|
||||
if(MACOSX_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-Xpreprocessor"
|
||||
"-fopenmp"
|
||||
"-DVLLM_CPU_EXTENSION")
|
||||
else()
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
@ -197,6 +195,7 @@ set(VLLM_EXT_SRC
|
||||
if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cpu/quant.cpp"
|
||||
"csrc/cpu/shm.cpp"
|
||||
${VLLM_EXT_SRC})
|
||||
endif()
|
||||
|
||||
|
||||
@ -78,9 +78,14 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
|
||||
__m256i reg;
|
||||
|
||||
// normal load
|
||||
explicit FP16Vec16(const void* ptr)
|
||||
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
explicit FP16Vec16(bool, void* ptr)
|
||||
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
|
||||
|
||||
explicit FP16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
|
||||
@ -110,9 +115,14 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
|
||||
__m256i reg;
|
||||
|
||||
// normal load
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
explicit BF16Vec16(bool, void* ptr)
|
||||
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
|
||||
|
||||
explicit BF16Vec16(const FP32Vec16&);
|
||||
|
||||
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
|
||||
@ -313,8 +323,13 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
|
||||
explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
|
||||
|
||||
// normal load
|
||||
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
explicit FP32Vec16(bool, void* ptr)
|
||||
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
|
||||
|
||||
explicit FP32Vec16(__m512 data) : reg(data) {}
|
||||
|
||||
explicit FP32Vec16(const FP32Vec4& data)
|
||||
@ -547,6 +562,33 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
|
||||
_mm_mask_storeu_epi8(ptr, mask, reg);
|
||||
}
|
||||
};
|
||||
|
||||
struct INT8Vec64 : public Vec<INT8Vec64> {
|
||||
constexpr static int VEC_ELEM_NUM = 64;
|
||||
union AliasReg {
|
||||
__m512i reg;
|
||||
int8_t values[VEC_ELEM_NUM];
|
||||
};
|
||||
|
||||
__m512i reg;
|
||||
|
||||
// normal load
|
||||
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
|
||||
|
||||
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
|
||||
|
||||
void save(int8_t* ptr, const int elem_num) const {
|
||||
constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
|
||||
__mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
|
||||
_mm512_mask_storeu_epi8(ptr, mask, reg);
|
||||
}
|
||||
|
||||
// non-temproal save
|
||||
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
|
||||
};
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
@ -657,6 +699,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
|
||||
|
||||
inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }
|
||||
|
||||
#ifdef __AVX512F__
|
||||
inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
|
||||
_mm256_stream_si256((__m256i*)ptr, vec.reg);
|
||||
}
|
||||
inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
|
||||
_mm512_stream_si512((__m512i*)ptr, vec.reg);
|
||||
}
|
||||
inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
|
||||
_mm256_stream_si256((__m256i*)ptr, vec.reg);
|
||||
}
|
||||
inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
|
||||
_mm512_stream_ps((float*)ptr, vec.reg);
|
||||
}
|
||||
#endif
|
||||
|
||||
inline void mem_barrier() { _mm_mfence(); }
|
||||
}; // namespace vec_op
|
||||
|
||||
#endif
|
||||
|
||||
781
csrc/cpu/shm.cpp
Normal file
781
csrc/cpu/shm.cpp
Normal file
@ -0,0 +1,781 @@
|
||||
#include "cpu/cpu_types.hpp"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
namespace {
|
||||
#define MAX_SHM_RANK_NUM 8
|
||||
#define MAX_THREAD_NUM 12
|
||||
#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
|
||||
#define MIN_THREAD_PROCESS_SIZE (8 * 1024)
|
||||
#define MAX_P2P_SEND_TENSOR_NUM 8
|
||||
|
||||
template <typename scalar_t>
|
||||
struct KernelVecType {
|
||||
using scalar_vec_t = void;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelVecType<float> {
|
||||
using scalar_vec_t = vec_op::FP32Vec16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelVecType<c10::BFloat16> {
|
||||
using scalar_vec_t = vec_op::BF16Vec16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct KernelVecType<c10::Half> {
|
||||
using scalar_vec_t = vec_op::FP16Vec16;
|
||||
};
|
||||
|
||||
enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE };
|
||||
|
||||
struct ThreadSHMContext {
|
||||
volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM];
|
||||
int thread_id;
|
||||
int thread_num;
|
||||
int rank;
|
||||
int group_size;
|
||||
size_t _spinning_count;
|
||||
int swizzled_ranks[MAX_SHM_RANK_NUM];
|
||||
void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
|
||||
ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
|
||||
|
||||
ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
|
||||
const int group_size, void* thread_shm_ptr)
|
||||
: thread_id(thread_id),
|
||||
thread_num(thread_num),
|
||||
rank(rank),
|
||||
group_size(group_size),
|
||||
_spinning_count(0) {
|
||||
static_assert(sizeof(ThreadSHMContext) % 64 == 0);
|
||||
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
|
||||
TORCH_CHECK((size_t)this % 64 == 0);
|
||||
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
|
||||
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
|
||||
shm_contexts[i] = nullptr;
|
||||
thread_shm_ptrs[i] = nullptr;
|
||||
swizzled_ranks[i] = (i + rank) % group_size;
|
||||
thread_stats[i] = ThreadSHMStat::DONE;
|
||||
}
|
||||
set_context(rank, this, thread_shm_ptr);
|
||||
}
|
||||
|
||||
void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
|
||||
TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
|
||||
TORCH_CHECK(ptr);
|
||||
TORCH_CHECK(thread_shm_ptr);
|
||||
TORCH_CHECK_EQ(ptr->thread_num, thread_num);
|
||||
TORCH_CHECK_EQ(ptr->thread_id, thread_id);
|
||||
shm_contexts[rank] = ptr;
|
||||
thread_shm_ptrs[rank] = thread_shm_ptr;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* get_thread_shm_ptr(int rank) {
|
||||
return reinterpret_cast<T*>(thread_shm_ptrs[rank]);
|
||||
}
|
||||
|
||||
int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
|
||||
|
||||
void wait_for_all(ThreadSHMStat prev_stat) {
|
||||
for (int idx = 0; idx < group_size; ++idx) {
|
||||
int rank = get_swizzled_rank(idx);
|
||||
while (thread_stats[rank] == prev_stat) {
|
||||
++_spinning_count;
|
||||
_mm_pause();
|
||||
}
|
||||
}
|
||||
vec_op::mem_barrier();
|
||||
}
|
||||
|
||||
void wait_for_one(int rank, ThreadSHMStat prev_stat) {
|
||||
while (thread_stats[rank] == prev_stat) {
|
||||
++_spinning_count;
|
||||
_mm_pause();
|
||||
}
|
||||
vec_op::mem_barrier();
|
||||
}
|
||||
|
||||
void set_thread_stat(ThreadSHMStat stat) {
|
||||
for (int idx = 0; idx < group_size; ++idx) {
|
||||
int rank = get_swizzled_rank(idx);
|
||||
shm_contexts[rank]->thread_stats[this->rank] = stat;
|
||||
}
|
||||
}
|
||||
|
||||
void set_thread_stat(int target_rank, ThreadSHMStat stat) {
|
||||
for (int idx = 0; idx < group_size; ++idx) {
|
||||
int rank = get_swizzled_rank(idx);
|
||||
shm_contexts[rank]->thread_stats[target_rank] = stat;
|
||||
}
|
||||
}
|
||||
|
||||
// barrier for all ranks in the group, used for all2all ops
|
||||
// DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ...
|
||||
void barrier(ThreadSHMStat next_stat) {
|
||||
if (next_stat == ThreadSHMStat::THREAD_READY) {
|
||||
set_thread_stat(ThreadSHMStat::THREAD_READY);
|
||||
wait_for_all(ThreadSHMStat::DONE);
|
||||
} else if (next_stat == ThreadSHMStat::SHM_DATA_READY) {
|
||||
set_thread_stat(ThreadSHMStat::SHM_DATA_READY);
|
||||
wait_for_all(ThreadSHMStat::THREAD_READY);
|
||||
} else if (next_stat == ThreadSHMStat::DONE) {
|
||||
set_thread_stat(ThreadSHMStat::DONE);
|
||||
wait_for_all(ThreadSHMStat::SHM_DATA_READY);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Invalid next_stat to barrier.");
|
||||
}
|
||||
}
|
||||
|
||||
std::string to_string() const {
|
||||
std::stringstream ss;
|
||||
ss << "SHMContext:";
|
||||
ss << "\nrank: " << rank;
|
||||
ss << "\ngroup_size: " << group_size;
|
||||
ss << "\nthread_num: " << thread_num;
|
||||
ss << "\nthread_id: " << thread_id;
|
||||
|
||||
ss << "\nshm_ctx_stat_loop_seq: [";
|
||||
for (int i = 0; i < group_size; ++i) {
|
||||
ss << swizzled_ranks[i] << ", ";
|
||||
}
|
||||
ss << "]";
|
||||
|
||||
ss << "\nshm_contexts: [";
|
||||
for (int i = 0; i < group_size; ++i) {
|
||||
if (shm_contexts[i]) {
|
||||
ss << shm_contexts[i]->rank << ", ";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
|
||||
class SHMManager {
|
||||
public:
|
||||
explicit SHMManager(const std::string& name, const int rank,
|
||||
const int group_size)
|
||||
: _rank(rank),
|
||||
_group_size(group_size),
|
||||
_thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)),
|
||||
_shm_names({""}),
|
||||
_shared_mem_ptrs({nullptr}),
|
||||
_shm_ctx(nullptr) {
|
||||
_shm_names[rank] = get_shm_name(name, rank);
|
||||
_shared_mem_ptrs[rank] = init_shm(rank);
|
||||
_shm_ctx = reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank]);
|
||||
|
||||
for (int i = 0; i < _thread_num; ++i) {
|
||||
ThreadSHMContext* ctx = new (_shm_ctx + i)
|
||||
ThreadSHMContext(i, _thread_num, _rank, _group_size,
|
||||
compute_thread_shm_ptr(_shm_ctx, i));
|
||||
}
|
||||
}
|
||||
|
||||
void join(const std::string& name) {
|
||||
for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) {
|
||||
if (rank_idx != _rank) {
|
||||
TORCH_CHECK(_shm_names[rank_idx].empty());
|
||||
TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr);
|
||||
_shm_names[rank_idx] = get_shm_name(name, rank_idx);
|
||||
_shared_mem_ptrs[rank_idx] = init_shm(rank_idx);
|
||||
ThreadSHMContext* target_ctx =
|
||||
reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank_idx]);
|
||||
for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) {
|
||||
_shm_ctx[thread_idx].set_context(
|
||||
rank_idx, target_ctx + thread_idx,
|
||||
compute_thread_shm_ptr(target_ctx, thread_idx));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~SHMManager() { destroy_shm(); }
|
||||
|
||||
ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; }
|
||||
|
||||
static std::string get_shm_name(const std::string& name, int rank) {
|
||||
return name + "_" + std::to_string(rank);
|
||||
}
|
||||
|
||||
static int64_t create_singleton_instance(const std::string& name,
|
||||
const int group_size,
|
||||
const int rank) {
|
||||
std::lock_guard<std::mutex> guard(SingletonInstancesLock);
|
||||
SingletonInstances.emplace_back(
|
||||
std::make_unique<SHMManager>(name, rank, group_size));
|
||||
return static_cast<int64_t>(SingletonInstances.size() - 1);
|
||||
}
|
||||
|
||||
static SHMManager* get_singleton_instance(int64_t handle) {
|
||||
return SingletonInstances[handle].get();
|
||||
}
|
||||
|
||||
protected:
|
||||
static std::vector<std::unique_ptr<SHMManager>> SingletonInstances;
|
||||
static std::mutex SingletonInstancesLock;
|
||||
|
||||
private:
|
||||
static size_t round_to_alignment(size_t num) {
|
||||
return ((num + 63) / 64) * 64;
|
||||
}
|
||||
|
||||
int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) {
|
||||
int8_t* thread_shm_ptr =
|
||||
reinterpret_cast<int8_t*>(ctx) +
|
||||
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
|
||||
return thread_shm_ptr +
|
||||
thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES);
|
||||
}
|
||||
|
||||
size_t compute_shm_size() {
|
||||
const size_t rounded_rank_buffer_size =
|
||||
round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num;
|
||||
const size_t rounded_thread_shm_ctx_size =
|
||||
round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
|
||||
const size_t shm_size =
|
||||
rounded_thread_shm_ctx_size + rounded_rank_buffer_size;
|
||||
return shm_size;
|
||||
}
|
||||
|
||||
void* init_shm(int target_rank) {
|
||||
const std::string& shm_name = _shm_names[target_rank];
|
||||
const int local_rank = _rank;
|
||||
const size_t shm_size = compute_shm_size();
|
||||
|
||||
int fd = -1;
|
||||
if (local_rank == target_rank) {
|
||||
fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR,
|
||||
S_IRUSR | S_IWUSR);
|
||||
|
||||
if (fd == -1)
|
||||
TORCH_CHECK(false, "create shm in SHMManager failed. errno: " +
|
||||
std::to_string(errno));
|
||||
|
||||
if (ftruncate(fd, shm_size) == -1)
|
||||
TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " +
|
||||
std::to_string(errno));
|
||||
} else {
|
||||
fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
|
||||
|
||||
if (fd == -1)
|
||||
TORCH_CHECK(false, "open shm in SHMManager failed. errno: " +
|
||||
std::to_string(errno));
|
||||
}
|
||||
|
||||
void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE,
|
||||
MAP_SHARED | MAP_POPULATE, fd, 0);
|
||||
|
||||
if (shm_ptr == MAP_FAILED) {
|
||||
TORCH_CHECK(false,
|
||||
"mmap in SHMManager failed. errno: " + std::to_string(errno));
|
||||
}
|
||||
|
||||
if (close(fd) != 0) {
|
||||
TORCH_CHECK(
|
||||
false, "close in SHMManager failed. errno: " + std::to_string(errno));
|
||||
}
|
||||
|
||||
TORCH_CHECK((size_t)shm_ptr % 64 == 0);
|
||||
|
||||
return shm_ptr;
|
||||
}
|
||||
|
||||
void destroy_shm() {
|
||||
std::stringstream ss;
|
||||
ss << "local rank " << _rank << ": [";
|
||||
for (int thread_id = 0; thread_id < _thread_num; ++thread_id) {
|
||||
ss << _shm_ctx[thread_id]._spinning_count << ", ";
|
||||
}
|
||||
ss << "]\n";
|
||||
|
||||
for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
|
||||
if (_shared_mem_ptrs[i] != nullptr) {
|
||||
munmap(_shared_mem_ptrs[i], compute_shm_size());
|
||||
}
|
||||
|
||||
if (!_shm_names[i].empty()) {
|
||||
shm_unlink(_shm_names[i].c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int _rank;
|
||||
int _group_size;
|
||||
int _thread_num;
|
||||
std::array<std::string, MAX_SHM_RANK_NUM> _shm_names;
|
||||
std::array<void*, MAX_SHM_RANK_NUM> _shared_mem_ptrs;
|
||||
ThreadSHMContext* _shm_ctx;
|
||||
};
|
||||
|
||||
namespace shm_cc_ops {
|
||||
template <typename scalar_t, typename F>
|
||||
void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
|
||||
int thread_num = ctx->thread_num;
|
||||
int64_t total_bytes = elem_num * sizeof(scalar_t);
|
||||
int64_t total_units_num =
|
||||
(total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE;
|
||||
int64_t per_thread_units_num =
|
||||
(total_units_num + thread_num - 1) / thread_num;
|
||||
int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t);
|
||||
int64_t max_per_thread_iteration_elem_num =
|
||||
PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t);
|
||||
int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num;
|
||||
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int i = 0; i < thread_num; ++i) {
|
||||
int64_t offset = i * per_thread_elem_num;
|
||||
int64_t end = std::min(elem_num, offset + per_thread_elem_num);
|
||||
int64_t curr_elem_num =
|
||||
std::min(max_per_thread_iteration_elem_num, end - offset);
|
||||
ThreadSHMContext* thread_ctx = ctx + i;
|
||||
|
||||
while (curr_elem_num > 0) {
|
||||
inner_func(thread_ctx, offset, curr_elem_num);
|
||||
|
||||
offset += max_per_thread_iteration_elem_num;
|
||||
curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}; // namespace shm_cc_ops
|
||||
|
||||
namespace shm_cc_ops {
|
||||
|
||||
void memcpy_from_shm(void* dst, void* src, const int64_t bytes) {
|
||||
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
|
||||
int64_t i = 0;
|
||||
#pragma GCC unroll 4
|
||||
for (; i < aligned_bytes; i += 64) {
|
||||
vec_op::INT8Vec64 data(
|
||||
true, (int8_t*)src + i); // stream loading shm to avoid caching
|
||||
data.save((int8_t*)dst + i);
|
||||
}
|
||||
if (aligned_bytes < bytes) {
|
||||
vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes);
|
||||
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
void memcpy_to_shm(void* dst, void* src, const int64_t bytes) {
|
||||
#pragma GCC unroll 4
|
||||
for (int64_t i = 0; i < bytes; i += 64) {
|
||||
vec_op::INT8Vec64 data((int8_t*)src + i);
|
||||
data.nt_save((int8_t*)dst + i);
|
||||
}
|
||||
}
|
||||
|
||||
void memcpy(void* dst, void* src, const int64_t bytes) {
|
||||
const int64_t aligned_bytes = ((bytes >> 6) << 6); // 64 bytes aligned
|
||||
int64_t i = 0;
|
||||
#pragma GCC unroll 4
|
||||
for (; i < aligned_bytes; i += 64) {
|
||||
vec_op::INT8Vec64 data((int8_t*)src + i);
|
||||
data.save((int8_t*)dst + i);
|
||||
}
|
||||
if (aligned_bytes < bytes) {
|
||||
vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes);
|
||||
data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, int RANKS>
|
||||
void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data,
|
||||
size_t elem_num) {
|
||||
CPU_KERNEL_GUARD_IN(all_reduce_sum_impl)
|
||||
using vec_t = typename KernelVecType<scalar_t>::scalar_vec_t;
|
||||
constexpr int64_t vec_elem_num = vec_t::get_elem_num();
|
||||
const int worldsize = ctx->group_size;
|
||||
|
||||
shm_cc_ops::shm_cc_loop<scalar_t>(
|
||||
ctx, elem_num,
|
||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||
int64_t data_elem_num) {
|
||||
int rank = thread_ctx->rank;
|
||||
scalar_t* thread_shm_ptr =
|
||||
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
|
||||
scalar_t* thread_data_ptr = data + data_offset;
|
||||
int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t);
|
||||
|
||||
scalar_t* remote_data_ptrs[RANKS - 1];
|
||||
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
|
||||
remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr<scalar_t>(
|
||||
thread_ctx->get_swizzled_rank(idx + 1));
|
||||
});
|
||||
|
||||
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
|
||||
|
||||
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr,
|
||||
thread_data_elem_num);
|
||||
|
||||
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
|
||||
|
||||
int64_t aligned_data_elem_num =
|
||||
(data_elem_num / vec_elem_num) * vec_elem_num;
|
||||
int64_t i = 0;
|
||||
#pragma GCC unroll 4
|
||||
for (; i < aligned_data_elem_num; i += vec_elem_num) {
|
||||
vec_t local_data(thread_data_ptr + i); // load from cache
|
||||
vec_op::FP32Vec16 local_data_fp32(local_data);
|
||||
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
|
||||
vec_t remote_data(
|
||||
true, remote_data_ptrs[idx] + i); // stream load from shm
|
||||
vec_op::FP32Vec16 remote_data_fp32(remote_data);
|
||||
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
|
||||
});
|
||||
vec_t reduced_data(local_data_fp32);
|
||||
reduced_data.save(thread_data_ptr + i);
|
||||
}
|
||||
|
||||
if (i < data_elem_num) {
|
||||
vec_t local_data(thread_data_ptr + i); // load from cache
|
||||
vec_op::FP32Vec16 local_data_fp32(local_data);
|
||||
vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
|
||||
vec_t remote_data(
|
||||
true, remote_data_ptrs[idx] + i); // stream load from shm
|
||||
vec_op::FP32Vec16 remote_data_fp32(remote_data);
|
||||
local_data_fp32 = local_data_fp32 + remote_data_fp32; // sum reduce
|
||||
});
|
||||
vec_t reduced_data(local_data_fp32);
|
||||
reduced_data.save(thread_data_ptr + i,
|
||||
data_elem_num - aligned_data_elem_num);
|
||||
}
|
||||
|
||||
thread_ctx->barrier(ThreadSHMStat::DONE);
|
||||
});
|
||||
|
||||
return;
|
||||
}
|
||||
}; // namespace shm_cc_ops
|
||||
|
||||
std::vector<std::unique_ptr<SHMManager>> SHMManager::SingletonInstances = {};
|
||||
std::mutex SHMManager::SingletonInstancesLock = {};
|
||||
|
||||
template <typename scalar_t>
|
||||
void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) {
|
||||
switch (ctx->group_size) {
|
||||
case 2:
|
||||
shm_cc_ops::all_reduce_sum_impl<scalar_t, 2>(ctx, data, elem_num);
|
||||
break;
|
||||
case 3:
|
||||
shm_cc_ops::all_reduce_sum_impl<scalar_t, 3>(ctx, data, elem_num);
|
||||
break;
|
||||
case 4:
|
||||
shm_cc_ops::all_reduce_sum_impl<scalar_t, 4>(ctx, data, elem_num);
|
||||
break;
|
||||
case 8:
|
||||
shm_cc_ops::all_reduce_sum_impl<scalar_t, 8>(ctx, data, elem_num);
|
||||
break;
|
||||
default:
|
||||
TORCH_CHECK(false,
|
||||
"Invalid world size: " + std::to_string(ctx->group_size));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num,
|
||||
scalar_t** outputs, const int dst) {
|
||||
CPU_KERNEL_GUARD_IN(shm_gather_impl)
|
||||
const int worldsize = ctx->group_size;
|
||||
TORCH_CHECK_LT(dst, worldsize);
|
||||
shm_cc_ops::shm_cc_loop<scalar_t>(
|
||||
ctx, elem_num,
|
||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||
int64_t data_elem_num) {
|
||||
int rank = thread_ctx->rank;
|
||||
scalar_t* thread_shm_ptr =
|
||||
thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
|
||||
|
||||
thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
|
||||
|
||||
shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset,
|
||||
data_elem_num * sizeof(scalar_t));
|
||||
|
||||
thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
|
||||
|
||||
if (rank == dst) {
|
||||
shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset,
|
||||
data_elem_num * sizeof(scalar_t));
|
||||
for (int i = 1; i < worldsize; ++i) {
|
||||
int src_rank = thread_ctx->get_swizzled_rank(i);
|
||||
scalar_t* src_ptr =
|
||||
thread_ctx->get_thread_shm_ptr<scalar_t>(src_rank); // shm
|
||||
scalar_t* dst_ptr = outputs[src_rank] + data_offset;
|
||||
shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr,
|
||||
data_elem_num * sizeof(scalar_t));
|
||||
}
|
||||
}
|
||||
|
||||
thread_ctx->barrier(ThreadSHMStat::DONE);
|
||||
});
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
struct MemPiece {
|
||||
void* ptr;
|
||||
int64_t size;
|
||||
|
||||
template <typename T>
|
||||
T* data_ptr() {
|
||||
return reinterpret_cast<T*>(ptr);
|
||||
}
|
||||
};
|
||||
|
||||
struct TensorListMeta {
|
||||
int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM];
|
||||
torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM];
|
||||
int64_t tensor_num;
|
||||
int64_t total_bytes;
|
||||
|
||||
TensorListMeta() : tensor_num(0), total_bytes(0) {
|
||||
static_assert(sizeof(TensorListMeta) % 64 == 0);
|
||||
static_assert(sizeof(TensorListMeta) <
|
||||
MIN_THREAD_PROCESS_SIZE); // To ensure the metadata always
|
||||
// hold by the thread 0
|
||||
for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) {
|
||||
tensor_bytes[i] = 0;
|
||||
tensor_ptrs[i] = nullptr;
|
||||
tensor_types[i] = torch::ScalarType::Undefined;
|
||||
}
|
||||
}
|
||||
|
||||
// For send and recv
|
||||
void bind_tensor_list(std::vector<torch::Tensor>& tensor_list) {
|
||||
TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined,
|
||||
"Re-bind TensorListMeta is not allowed.")
|
||||
TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM);
|
||||
tensor_num = tensor_list.size();
|
||||
int64_t bytes_sum = 0;
|
||||
for (int i = 0; i < tensor_list.size(); ++i) {
|
||||
torch::Tensor& t = tensor_list[i];
|
||||
TORCH_CHECK(t.is_contiguous());
|
||||
tensor_bytes[i] = t.nbytes();
|
||||
tensor_types[i] = t.scalar_type();
|
||||
tensor_ptrs[i] = t.data_ptr();
|
||||
bytes_sum += t.nbytes();
|
||||
}
|
||||
total_bytes = bytes_sum;
|
||||
}
|
||||
|
||||
// For recv
|
||||
std::vector<torch::Tensor> generate_tensor_list() {
|
||||
std::vector<torch::Tensor> tensor_list;
|
||||
tensor_list.reserve(tensor_num);
|
||||
|
||||
for (int i = 0; i < tensor_num; ++i) {
|
||||
int64_t bytes = tensor_bytes[i];
|
||||
auto type = tensor_types[i];
|
||||
int64_t elem_bytes = torch::elementSize(type);
|
||||
|
||||
TORCH_CHECK_EQ(bytes % elem_bytes, 0);
|
||||
int64_t elem_num = bytes / elem_bytes;
|
||||
auto options = torch::TensorOptions().dtype(type).device(torch::kCPU);
|
||||
tensor_list.emplace_back(torch::empty({elem_num}, options));
|
||||
}
|
||||
return tensor_list;
|
||||
}
|
||||
|
||||
MemPiece get_data(int64_t offset) {
|
||||
for (int i = 0; i < tensor_num; ++i) {
|
||||
if (offset < tensor_bytes[i]) {
|
||||
return {reinterpret_cast<int8_t*>(tensor_ptrs[i]) + offset,
|
||||
tensor_bytes[i] - offset};
|
||||
}
|
||||
offset -= tensor_bytes[i];
|
||||
}
|
||||
return {nullptr, 0};
|
||||
}
|
||||
|
||||
private:
|
||||
void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM];
|
||||
int8_t _padding[40];
|
||||
};
|
||||
|
||||
void shm_send_tensor_list_impl(ThreadSHMContext* ctx,
|
||||
const std::vector<torch::Tensor>& tensor_list) {
|
||||
CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl)
|
||||
std::vector<torch::Tensor> tensor_list_with_metadata;
|
||||
tensor_list_with_metadata.reserve(1 + tensor_list.size());
|
||||
|
||||
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
|
||||
tensor_list_with_metadata.emplace_back(
|
||||
torch::empty({sizeof(TensorListMeta)}, options));
|
||||
tensor_list_with_metadata.insert(tensor_list_with_metadata.end(),
|
||||
tensor_list.begin(), tensor_list.end());
|
||||
|
||||
torch::Tensor& metadata_tensor = tensor_list_with_metadata[0];
|
||||
TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta));
|
||||
|
||||
TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
|
||||
metadata->bind_tensor_list(tensor_list_with_metadata);
|
||||
|
||||
shm_cc_ops::shm_cc_loop<int8_t>(
|
||||
ctx, metadata->total_bytes,
|
||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||
int64_t data_elem_num) {
|
||||
int rank = thread_ctx->rank;
|
||||
// Wait until the receiver set the stat to DONE
|
||||
thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY);
|
||||
|
||||
int64_t curr_shm_offset = 0;
|
||||
while (curr_shm_offset < data_elem_num) {
|
||||
MemPiece frag = metadata->get_data(data_offset + curr_shm_offset);
|
||||
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
|
||||
shm_cc_ops::memcpy(
|
||||
thread_ctx->get_thread_shm_ptr<int8_t>(rank) + curr_shm_offset,
|
||||
frag.ptr, frag.size);
|
||||
curr_shm_offset += frag.size;
|
||||
}
|
||||
|
||||
thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY);
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
|
||||
int64_t src) {
|
||||
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl)
|
||||
auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
|
||||
torch::Tensor metadata_tensor =
|
||||
torch::empty({sizeof(TensorListMeta)}, options);
|
||||
|
||||
// Wait until the sender set the stat of the thread 0 to SHM_DATA_READY
|
||||
ctx->wait_for_one(src, ThreadSHMStat::DONE);
|
||||
shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
|
||||
ctx->get_thread_shm_ptr<void>(src),
|
||||
sizeof(TensorListMeta));
|
||||
TensorListMeta* src_metadata =
|
||||
reinterpret_cast<TensorListMeta*>(metadata_tensor.data_ptr());
|
||||
std::vector<torch::Tensor> tensor_list_with_metadata =
|
||||
src_metadata->generate_tensor_list();
|
||||
|
||||
TensorListMeta metadata;
|
||||
metadata.bind_tensor_list(tensor_list_with_metadata);
|
||||
TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num);
|
||||
TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes);
|
||||
|
||||
shm_cc_ops::shm_cc_loop<int8_t>(
|
||||
ctx, metadata.total_bytes,
|
||||
[&](ThreadSHMContext* thread_ctx, int64_t data_offset,
|
||||
int64_t data_elem_num) {
|
||||
// Wait until the sender set the stat to SHM_DATA_READY
|
||||
thread_ctx->wait_for_one(src, ThreadSHMStat::DONE);
|
||||
int64_t curr_shm_offset = 0;
|
||||
while (curr_shm_offset < data_elem_num) {
|
||||
MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
|
||||
frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
|
||||
shm_cc_ops::memcpy(
|
||||
frag.ptr,
|
||||
thread_ctx->get_thread_shm_ptr<int8_t>(src) + curr_shm_offset,
|
||||
frag.size);
|
||||
curr_shm_offset += frag.size;
|
||||
}
|
||||
|
||||
thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE);
|
||||
});
|
||||
|
||||
std::vector<torch::Tensor> tensor_list;
|
||||
tensor_list.reserve(metadata.tensor_num - 1);
|
||||
tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1,
|
||||
tensor_list_with_metadata.end());
|
||||
|
||||
return tensor_list;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void shm_gather(int64_t handle, torch::Tensor& data,
|
||||
const std::optional<std::vector<torch::Tensor>>& outputs,
|
||||
int64_t dst) {
|
||||
TORCH_CHECK(data.is_contiguous())
|
||||
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(shm_gather_impl)
|
||||
|
||||
if (outputs.has_value()) {
|
||||
TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM);
|
||||
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
|
||||
for (int i = 0; i < outputs->size(); ++i) {
|
||||
output_ptrs[i] = outputs->at(i).data_ptr<scalar_t>();
|
||||
}
|
||||
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
|
||||
data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
|
||||
dst);
|
||||
} else {
|
||||
shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
|
||||
data.data_ptr<scalar_t>(), data.numel(), (scalar_t**)(0),
|
||||
dst);
|
||||
}
|
||||
|
||||
CPU_KERNEL_GUARD_OUT(shm_gather_impl)
|
||||
});
|
||||
}
|
||||
|
||||
void shm_all_gather(int64_t handle, const torch::Tensor& data,
|
||||
torch::Tensor& output) {
|
||||
TORCH_CHECK(data.is_contiguous())
|
||||
TORCH_CHECK(output.is_contiguous())
|
||||
|
||||
const int64_t input_elem_num = data.numel();
|
||||
const int64_t output_elem_num = output.numel();
|
||||
TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0);
|
||||
const int world_size = output_elem_num / input_elem_num;
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] {
|
||||
CPU_KERNEL_GUARD_IN(shm_all_gather_impl)
|
||||
auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
|
||||
TORCH_CHECK_EQ(ctx->group_size, world_size);
|
||||
|
||||
scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
|
||||
for (int i = 0; i < world_size; ++i) {
|
||||
output_ptrs[i] = output.data_ptr<scalar_t>() + i * input_elem_num;
|
||||
}
|
||||
shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
|
||||
ctx->rank);
|
||||
CPU_KERNEL_GUARD_OUT(shm_all_gather_impl)
|
||||
});
|
||||
}
|
||||
|
||||
void shm_allreduce(int64_t handle, torch::Tensor& data) {
|
||||
TORCH_CHECK(data.is_contiguous())
|
||||
VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] {
|
||||
CPU_KERNEL_GUARD_IN(shm_allreduce_sum)
|
||||
shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
|
||||
data.data_ptr<scalar_t>(), data.numel());
|
||||
CPU_KERNEL_GUARD_OUT(shm_allreduce_sum)
|
||||
});
|
||||
}
|
||||
|
||||
void shm_send_tensor_list(int64_t handle,
|
||||
const std::vector<torch::Tensor>& tensor_list,
|
||||
int64_t dst) {
|
||||
CPU_KERNEL_GUARD_IN(shm_send_tensor_list)
|
||||
shm_send_tensor_list_impl(
|
||||
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list);
|
||||
CPU_KERNEL_GUARD_OUT(shm_send_tensor_list)
|
||||
}
|
||||
|
||||
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
|
||||
CPU_KERNEL_GUARD_IN(shm_recv_tensor_list)
|
||||
auto tensor_list = shm_recv_tensor_list_impl(
|
||||
SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src);
|
||||
CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list)
|
||||
return tensor_list;
|
||||
}
|
||||
|
||||
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
|
||||
const int64_t rank) {
|
||||
return SHMManager::create_singleton_instance(name, group_size, rank);
|
||||
}
|
||||
|
||||
std::string join_shm_manager(int64_t handle, const std::string& name) {
|
||||
auto shm_manager = SHMManager::get_singleton_instance(handle);
|
||||
TORCH_CHECK(shm_manager);
|
||||
shm_manager->join(name);
|
||||
return shm_manager->get_shm_ctx()->to_string();
|
||||
}
|
||||
@ -22,6 +22,26 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
|
||||
torch::Tensor& kv_cache, double scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& seq_lens);
|
||||
|
||||
int64_t init_shm_manager(const std::string& name, const int64_t group_size,
|
||||
const int64_t rank);
|
||||
|
||||
std::string join_shm_manager(int64_t handle, const std::string& name);
|
||||
|
||||
void shm_allreduce(int64_t handle, torch::Tensor& data);
|
||||
|
||||
void shm_gather(int64_t handle, torch::Tensor& data,
|
||||
const std::optional<std::vector<torch::Tensor>>& outputs,
|
||||
int64_t dst);
|
||||
|
||||
void shm_all_gather(int64_t handle, const torch::Tensor& data,
|
||||
torch::Tensor& output);
|
||||
|
||||
void shm_send_tensor_list(int64_t handle,
|
||||
const std::vector<torch::Tensor>& tensor_list,
|
||||
int64_t dst);
|
||||
|
||||
std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src);
|
||||
|
||||
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
// vLLM custom ops
|
||||
|
||||
@ -131,6 +151,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" Tensor? azp, Tensor? bias) -> ()");
|
||||
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
|
||||
#endif
|
||||
|
||||
// SHM CCL
|
||||
#ifdef __AVX512F__
|
||||
ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
|
||||
&init_shm_manager);
|
||||
ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
|
||||
ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
|
||||
ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
|
||||
ops.def(
|
||||
"shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> "
|
||||
"()");
|
||||
ops.impl("shm_gather", torch::kCPU, &shm_gather);
|
||||
ops.def(
|
||||
"shm_all_gather(int handle, Tensor data, Tensor! output) -> "
|
||||
"()");
|
||||
ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather);
|
||||
ops.def(
|
||||
"shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> "
|
||||
"()");
|
||||
ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
|
||||
ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
|
||||
&shm_recv_tensor_list);
|
||||
#endif
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
|
||||
@ -18,7 +18,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
|
||||
#ifndef VLLM_NUMA_DISABLED
|
||||
std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
|
||||
bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
|
||||
TORCH_CHECK(omp_cpu_mask->size > 0);
|
||||
std::vector<int> omp_cpu_ids;
|
||||
omp_cpu_ids.reserve(omp_cpu_mask->size);
|
||||
|
||||
@ -145,7 +145,8 @@ torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
|
||||
#endif
|
||||
|
||||
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
|
||||
int64_t n);
|
||||
int64_t n,
|
||||
std::optional<at::ScalarType> const& dtype);
|
||||
|
||||
torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
|
||||
int64_t type, int64_t row);
|
||||
|
||||
@ -94,8 +94,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
||||
dfloat2 v;
|
||||
dequantize_kernel(vx, ib, iqs, v);
|
||||
|
||||
y[iybs + iqs + 0] = v.x;
|
||||
y[iybs + iqs + y_offset] = v.y;
|
||||
y[iybs + iqs + 0] = convert_from_half<dst_t>(v.x);
|
||||
y[iybs + iqs + y_offset] = convert_from_half<dst_t>(v.y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@ -114,10 +114,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
|
||||
half dall = __low2half(x[i].dm);
|
||||
half dmin = __high2half(x[i].dm);
|
||||
y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4)));
|
||||
y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4)));
|
||||
y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4)));
|
||||
y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4)));
|
||||
y[l+ 0] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+0] >> 4))));
|
||||
y[l+32] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+2] >> 4))));
|
||||
y[l+64] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+4] >> 4))));
|
||||
y[l+96] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin, __int2half_rn(x[i].scales[is+6] >> 4))));
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@ -148,7 +148,9 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
||||
const uint8_t * q = x[i].qs + 32*n;
|
||||
const uint8_t * hm = x[i].hmask;
|
||||
|
||||
for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)));
|
||||
for (int l = l0; l < l0+4; ++l) {
|
||||
y[l] = convert_from_half<dst_t>(__hmul(dl, __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))));
|
||||
}
|
||||
}
|
||||
|
||||
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||
@ -188,8 +190,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
||||
const half d2 = __hmul(dall, __int2half_rn(sc));
|
||||
const half m2 = __hmul(dmin, __int2half_rn(m));
|
||||
for (int l = 0; l < n; ++l) {
|
||||
y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1);
|
||||
y[l +32] = __hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2);
|
||||
y[l + 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1));
|
||||
y[l +32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn(q[l] >> 4)), m2));
|
||||
}
|
||||
}
|
||||
|
||||
@ -220,11 +222,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
||||
const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));
|
||||
|
||||
uint8_t hm = 1 << (2*il);
|
||||
y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1);
|
||||
y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1);
|
||||
y[ 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1));
|
||||
y[ 1] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1));
|
||||
hm <<= 1;
|
||||
y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2);
|
||||
y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2);
|
||||
y[32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[0] >> 4) + (qh[0] & hm ? 16 : 0))), m2));
|
||||
y[33] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[1] >> 4) + (qh[1] & hm ? 16 : 0))), m2));
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@ -247,10 +249,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||
const uint8_t qh = x[i].qh[32*ip + il];
|
||||
const int8_t * sc = x[i].scales + is;
|
||||
|
||||
y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
|
||||
y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
|
||||
y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
|
||||
y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
|
||||
y[ 0] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))));
|
||||
y[32] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))));
|
||||
y[64] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32))));
|
||||
y[96] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32))));
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@ -269,7 +271,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||
const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@ -286,7 +288,7 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
|
||||
const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
||||
for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
|
||||
}
|
||||
|
||||
@ -303,7 +305,7 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
|
||||
const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||
for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@ -324,8 +326,8 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
|
||||
y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
@ -345,8 +347,8 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||
const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
|
||||
const uint8_t signs = x[i].signs[4*ib + il];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
|
||||
y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
@ -367,7 +369,7 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
||||
grid32[0] &= 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = __float2half(d * (q[j] + delta));
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
}
|
||||
|
||||
@ -392,7 +394,7 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||
grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
|
||||
grid32[0] &= 0x0f0f0f0f;
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = __float2half(d * (q[j] + delta));
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
}
|
||||
|
||||
@ -409,8 +411,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = __half2float(x[ib].d);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
|
||||
y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
|
||||
}
|
||||
@ -427,8 +429,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
||||
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
|
||||
const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
|
||||
y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >> 4]);
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
}
|
||||
|
||||
@ -522,7 +524,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
|
||||
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
|
||||
template<typename dst_t>
|
||||
static to_cuda_ggml_t<dst_t> ggml_get_to_cuda(int64_t type) {
|
||||
switch (type) {
|
||||
case 2:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
|
||||
@ -1063,7 +1063,8 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -
|
||||
typedef half dfloat; // dequantize float
|
||||
typedef half2 dfloat2;
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
||||
typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream);
|
||||
template<typename dst_t>
|
||||
using to_cuda_ggml_t = void (*)(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream);
|
||||
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
||||
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
|
||||
typedef void (*load_tiles_cuda_t)(
|
||||
@ -1075,6 +1076,25 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
|
||||
|
||||
// Utility function
|
||||
|
||||
template<typename dst_t>
|
||||
static __device__ __forceinline__ dst_t convert_from_half(half val) {
|
||||
return val;
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ c10::BFloat16 convert_from_half<c10::BFloat16>(half val) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||
return __float2bfloat16(__half2float(val));
|
||||
#else
|
||||
return __half2float(val);
|
||||
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ float convert_from_half<float>(half val) {
|
||||
return __half2float(val);
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
|
||||
#ifndef __has_builtin
|
||||
|
||||
@ -71,14 +71,19 @@ static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
|
||||
}
|
||||
|
||||
torch::Tensor ggml_dequantize(torch::Tensor W, // quant weight
|
||||
int64_t type, int64_t m, int64_t n) {
|
||||
int64_t type, int64_t m, int64_t n,
|
||||
std::optional<at::ScalarType> const& dtype) {
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
|
||||
auto options =
|
||||
torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
|
||||
auto dtype_ = dtype.value_or(torch::kFloat16);
|
||||
auto options = torch::TensorOptions().dtype(dtype_).device(W.device());
|
||||
at::Tensor DW = torch::empty({m, n}, options);
|
||||
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
|
||||
to_fp16_cuda((void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream);
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] {
|
||||
auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
|
||||
to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
|
||||
});
|
||||
|
||||
return DW;
|
||||
}
|
||||
|
||||
|
||||
@ -1785,7 +1785,7 @@ __global__ void Marlin(
|
||||
<<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
|
||||
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
|
||||
num_groups, prob_m, prob_n, prob_k, lda, locks, \
|
||||
use_atomic_add, use_fp32_reduce); \
|
||||
part_use_atomic_add, use_fp32_reduce); \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
|
||||
thread_m_blocks = exec_cfg.max_m_blocks;
|
||||
}
|
||||
|
||||
// atomic add reduce have better performance only when m * n is small
|
||||
bool part_use_atomic_add =
|
||||
use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
|
||||
|
||||
if (false) {
|
||||
}
|
||||
GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)
|
||||
|
||||
@ -272,6 +272,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride,
|
||||
@ -291,6 +292,13 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const int rowid = laneid / 16;
|
||||
|
||||
const auto seq_idx = blockIdx.x;
|
||||
// NOTE queries with sequence len > 1 are prefills and taken care by another
|
||||
// kernel.
|
||||
if (query_start_loc_ptr != nullptr &&
|
||||
(query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto partition_idx = blockIdx.y;
|
||||
|
||||
constexpr int T_PAR_SIZE = 256; // token partition size set to 256
|
||||
@ -377,9 +385,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
// fetch Q in shared across warps and then write to registers
|
||||
const int local_qhead_idx = 4 * warpid + rowid;
|
||||
const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
|
||||
const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
|
||||
const int64_t query_start_off = static_cast<int64_t>(
|
||||
query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
|
||||
const scalar_t* q_ptr =
|
||||
q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE;
|
||||
q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
|
||||
|
||||
const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
|
||||
if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
|
||||
@ -777,6 +786,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride,
|
||||
@ -794,6 +804,12 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const int lane4id = laneid % 4;
|
||||
|
||||
const auto seq_idx = blockIdx.x;
|
||||
// NOTE queries with sequence len > 1 are prefills and taken care by another
|
||||
// kernel.
|
||||
if (query_start_loc_ptr != nullptr &&
|
||||
(query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
|
||||
return;
|
||||
}
|
||||
const auto partition_idx = blockIdx.y;
|
||||
const auto partition_size = blockDim.x;
|
||||
const auto max_num_partitions = gridDim.y;
|
||||
@ -882,9 +898,11 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
}
|
||||
|
||||
// fetch q elements
|
||||
// every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elems
|
||||
// every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elemsc
|
||||
const int64_t query_start_off = static_cast<int64_t>(
|
||||
query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
|
||||
const scalar_t* q_ptr =
|
||||
q + seq_idx * q_stride + wg_start_head_idx * HEAD_SIZE;
|
||||
q + query_start_off * q_stride + wg_start_head_idx * HEAD_SIZE;
|
||||
const _B16x8* q_ptrh8 = reinterpret_cast<const _B16x8*>(q_ptr);
|
||||
const int qhead_elemh8 = laneid / 4;
|
||||
|
||||
@ -1267,10 +1285,19 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
|
||||
// max_num_partitions, head_size]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_partitions) {
|
||||
const auto num_heads = gridDim.x;
|
||||
const auto head_idx = blockIdx.x;
|
||||
const auto seq_idx = blockIdx.y;
|
||||
|
||||
// NOTE queries with sequence len > 1 are prefills and taken care by another
|
||||
// kernel.
|
||||
if (query_start_loc_ptr != nullptr &&
|
||||
(query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int context_len = context_lens[seq_idx];
|
||||
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
|
||||
[[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
|
||||
@ -1439,7 +1466,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
__fdividef(1.0f, shared_global_exp_sum + 1e-6f);
|
||||
acc *= inv_global_exp_sum;
|
||||
|
||||
OUTT* out_ptr = out + static_cast<int64_t>(seq_idx) * num_heads * HEAD_SIZE +
|
||||
const int64_t query_start_off = static_cast<int64_t>(
|
||||
query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
|
||||
OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
|
||||
static_cast<int64_t>(head_idx) * HEAD_SIZE;
|
||||
if constexpr (std::is_same<OUTT, bit8_t>::value) {
|
||||
out_ptr[threadIdx.x] =
|
||||
@ -1466,6 +1495,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride,
|
||||
@ -1492,6 +1522,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
|
||||
const float scale,
|
||||
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_blocks_per_seq,
|
||||
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||
const int q_stride,
|
||||
@ -1515,6 +1546,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
|
||||
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
|
||||
const int* __restrict__ context_lens, // [num_seqs]
|
||||
const int* __restrict__ query_start_loc_ptr, // [num_seqs]
|
||||
const int max_num_partitions) {
|
||||
UNREACHABLE_CODE
|
||||
}
|
||||
@ -1522,34 +1554,34 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
|
||||
|
||||
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
|
||||
|
||||
#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \
|
||||
paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
|
||||
HEAD_SIZE, NTHR, ALIBI_ENABLED, \
|
||||
GQA_RATIO> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
|
||||
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
|
||||
exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
|
||||
k_scale_ptr, v_scale_ptr);
|
||||
#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO) \
|
||||
paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
|
||||
HEAD_SIZE, NTHR, ALIBI_ENABLED, \
|
||||
GQA_RATIO> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \
|
||||
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
|
||||
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \
|
||||
max_ctx_blocks, k_scale_ptr, v_scale_ptr);
|
||||
|
||||
#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \
|
||||
paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
|
||||
HEAD_SIZE, NTHR, ALIBI_ENABLED, \
|
||||
GQA_RATIO> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
|
||||
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
|
||||
exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
|
||||
k_scale_ptr, v_scale_ptr);
|
||||
#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO) \
|
||||
paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
|
||||
HEAD_SIZE, NTHR, ALIBI_ENABLED, \
|
||||
GQA_RATIO> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||
block_tables_ptr, context_lens_ptr, query_start_loc_ptr, \
|
||||
max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
|
||||
kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, \
|
||||
max_ctx_blocks, k_scale_ptr, v_scale_ptr);
|
||||
|
||||
#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS) \
|
||||
paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
|
||||
PARTITION_SIZE, NPAR_LOOPS> \
|
||||
<<<reduce_grid, reduce_block, 0, stream>>>( \
|
||||
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, \
|
||||
context_lens_ptr, max_num_partitions);
|
||||
context_lens_ptr, query_start_loc_ptr, max_num_partitions);
|
||||
|
||||
template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
|
||||
int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
|
||||
@ -1559,9 +1591,10 @@ void paged_attention_custom_launcher(
|
||||
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, const int num_kv_heads, float scale,
|
||||
torch::Tensor& block_tables, torch::Tensor& context_lens,
|
||||
int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
|
||||
torch::Tensor& k_scale, torch::Tensor& v_scale) {
|
||||
int num_seqs = query.size(0);
|
||||
const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale) {
|
||||
int num_seqs = block_tables.size(0);
|
||||
int num_heads = query.size(1);
|
||||
int head_size = query.size(2);
|
||||
int max_num_blocks_per_seq = block_tables.size(1);
|
||||
@ -1569,6 +1602,13 @@ void paged_attention_custom_launcher(
|
||||
int kv_block_stride = key_cache.stride(0);
|
||||
int kv_head_stride = key_cache.stride(1);
|
||||
|
||||
// NOTE: query start location is optional for V0 decode should not be used.
|
||||
// If batch contains mix of prefills and decode, prefills should be skipped.
|
||||
const int* query_start_loc_ptr =
|
||||
query_start_loc
|
||||
? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
|
||||
: nullptr;
|
||||
|
||||
// NOTE: alibi_slopes is optional.
|
||||
const float* alibi_slopes_ptr =
|
||||
alibi_slopes
|
||||
@ -1700,8 +1740,8 @@ void paged_attention_custom_launcher(
|
||||
paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
|
||||
PSIZE, ALIBI_ENABLED>( \
|
||||
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||
num_kv_heads, scale, block_tables, context_lens, max_context_len, \
|
||||
alibi_slopes, k_scale, v_scale);
|
||||
num_kv_heads, scale, block_tables, context_lens, query_start_loc, \
|
||||
max_context_len, alibi_slopes, k_scale, v_scale);
|
||||
|
||||
#define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, \
|
||||
PSIZE) \
|
||||
@ -1750,6 +1790,7 @@ void paged_attention(
|
||||
double scale,
|
||||
torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
|
||||
torch::Tensor& context_lens, // [num_seqs]
|
||||
const std::optional<torch::Tensor>& query_start_loc, // [num_seqs]
|
||||
int64_t block_size, int64_t max_context_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
|
||||
@ -7,8 +7,9 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
|
||||
torch::Tensor& query, torch::Tensor& key_cache,
|
||||
torch::Tensor& value_cache, int64_t num_kv_heads,
|
||||
double scale, torch::Tensor& block_tables,
|
||||
torch::Tensor& context_lens, int64_t block_size,
|
||||
int64_t max_context_len,
|
||||
torch::Tensor& context_lens,
|
||||
const std::optional<torch::Tensor>& query_start_loc,
|
||||
int64_t block_size, int64_t max_context_len,
|
||||
const std::optional<torch::Tensor>& alibi_slopes,
|
||||
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
|
||||
torch::Tensor& v_scale);
|
||||
|
||||
@ -23,7 +23,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
|
||||
" Tensor query, Tensor key_cache,"
|
||||
" Tensor value_cache, int num_kv_heads,"
|
||||
" float scale, Tensor block_tables,"
|
||||
" Tensor context_lens, int block_size,"
|
||||
" Tensor context_lens,"
|
||||
" Tensor? query_start_loc,"
|
||||
" int block_size,"
|
||||
" int max_context_len,"
|
||||
" Tensor? alibi_slopes,"
|
||||
" str kv_cache_dtype,"
|
||||
|
||||
@ -295,7 +295,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
#endif
|
||||
|
||||
// Dequantization for GGML.
|
||||
ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor");
|
||||
ops.def(
|
||||
"ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? "
|
||||
"dtype) -> Tensor");
|
||||
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
|
||||
|
||||
// mmvq kernel for GGML.
|
||||
|
||||
@ -38,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \
|
||||
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
|
||||
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
|
||||
&& python -m pip install -U pip uv \
|
||||
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
|
||||
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
|
||||
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
|
||||
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
|
||||
&& cd /tmp && touch control
|
||||
@ -238,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
&& python -m pip install -U pip uv --no-cache \
|
||||
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
|
||||
&& make -C /numactl install \
|
||||
&& uv pip install cmake \
|
||||
&& uv pip install 'cmake<4' \
|
||||
&& cmake --install /lapack/build \
|
||||
&& uv pip uninstall cmake
|
||||
|
||||
|
||||
@ -4,6 +4,8 @@
|
||||
|
||||
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||
|
||||
- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||
- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
|
||||
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
|
||||
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
|
||||
|
||||
@ -22,6 +22,7 @@ Compute Resources:
|
||||
- Databricks
|
||||
- DeepInfra
|
||||
- Google Cloud
|
||||
- Intel
|
||||
- Lambda Lab
|
||||
- Nebius
|
||||
- Novita AI
|
||||
|
||||
@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
|
||||
create a custom Dockerfile on top of the base image with an extra layer that installs them:
|
||||
|
||||
```Dockerfile
|
||||
FROM vllm/vllm-openai:v0.8.2
|
||||
FROM vllm/vllm-openai:v0.8.3
|
||||
|
||||
# e.g. install the `audio` and `video` optional dependencies
|
||||
# e.g. install the `audio` optional dependencies
|
||||
# NOTE: Make sure the version of vLLM matches the base image!
|
||||
RUN uv pip install --system vllm[audio,video]==0.8.2
|
||||
RUN uv pip install --system vllm[audio]==0.8.3
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
@ -46,6 +46,7 @@ metadata:
|
||||
type: Opaque
|
||||
data:
|
||||
token: $(HF_TOKEN)
|
||||
EOF
|
||||
```
|
||||
|
||||
Next, start the vLLM server as a Kubernetes Deployment and Service:
|
||||
|
||||
@ -8,7 +8,7 @@ Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModal
|
||||
|
||||
## Prompt Update Detection
|
||||
|
||||
One of the main responsibilies of HF processor is to update the prompt with placeholder tokens. For example:
|
||||
One of the main responsibilities of HF processor is to update the prompt with placeholder tokens. For example:
|
||||
|
||||
- Insert feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size) at the start of the string.
|
||||
- Replace existing input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size).
|
||||
|
||||
@ -126,7 +126,7 @@ Unfortunately, because auto-tuning takes quite a long time (from seconds to minu
|
||||
|
||||
## Cudagraph Capture
|
||||
|
||||
vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trival to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.
|
||||
vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.
|
||||
|
||||
The piecewise cudagraph also has fine-grained memory management. The purpose is to only exclude the attention kernel from cudagraph, while keeping all the rest modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 has the output tensor as the input of the attention.
|
||||
|
||||
|
||||
@ -19,17 +19,20 @@ And usually, these repositories have a config.json file that includes a quantiza
|
||||
|
||||
## Read quantized checkpoint
|
||||
|
||||
For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
import torch
|
||||
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
|
||||
model_id = "unsloth/tinyllama-bnb-4bit"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes")
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
|
||||
```
|
||||
|
||||
## Inflight quantization: load as 4bit quantization
|
||||
|
||||
For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
import torch
|
||||
@ -40,7 +43,7 @@ quantization="bitsandbytes")
|
||||
|
||||
## OpenAI Compatible Server
|
||||
|
||||
Append the following to your 4bit model arguments:
|
||||
Append the following to your model arguments for 4bit inflight quantization:
|
||||
|
||||
```console
|
||||
--quantization bitsandbytes
|
||||
|
||||
@ -29,7 +29,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlam
|
||||
We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
|
||||
:::
|
||||
|
||||
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-confing-path
|
||||
GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
|
||||
|
||||
```console
|
||||
# If you model is not supported by huggingface you can manually provide a huggingface compatible config path
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# Tool Calling
|
||||
|
||||
vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but [on the roadmap](gh-issue:13002).
|
||||
vLLM currently supports named function calling, as well as the `auto`, `required` (as of `vllm>=0.8.3`) and `none` options for the `tool_choice` field in the chat completion API.
|
||||
|
||||
## Quickstart
|
||||
|
||||
@ -91,6 +91,12 @@ For best results, we recommend ensuring that the expected output format / schema
|
||||
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
|
||||
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
|
||||
|
||||
## Required Function Calling
|
||||
|
||||
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine.
|
||||
|
||||
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
|
||||
|
||||
## Automatic Function Calling
|
||||
|
||||
To enable this feature, you should set the following flags:
|
||||
|
||||
@ -17,6 +17,7 @@ def fix_case(text: str) -> str:
|
||||
"cli": "CLI",
|
||||
"cpu": "CPU",
|
||||
"llm": "LLM",
|
||||
"mae": "MAE",
|
||||
"tpu": "TPU",
|
||||
"aqlm": "AQLM",
|
||||
"gguf": "GGUF",
|
||||
@ -24,6 +25,7 @@ def fix_case(text: str) -> str:
|
||||
"rlhf": "RLHF",
|
||||
"vllm": "vLLM",
|
||||
"openai": "OpenAI",
|
||||
"lmcache": "LMCache",
|
||||
"multilora": "MultiLoRA",
|
||||
"mlpspeculator": "MLPSpeculator",
|
||||
r"fp\d+": lambda x: x.group(0).upper(), # e.g. fp16, fp32
|
||||
|
||||
@ -272,12 +272,14 @@ $ python examples/offline_inference/basic/basic.py
|
||||
|
||||
- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
|
||||
|
||||
- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.
|
||||
- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is a option for better performance.
|
||||
|
||||
- Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
|
||||
- Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
|
||||
|
||||
```console
|
||||
VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
|
||||
```
|
||||
|
||||
- Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.inc.md).
|
||||
- For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to a same NUMA node.
|
||||
|
||||
- Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, TP worker will be killed due to out-of-memory.
|
||||
|
||||
@ -31,7 +31,7 @@ Currently, there are no pre-built ROCm wheels.
|
||||
```console
|
||||
# Install PyTorch
|
||||
$ pip uninstall torch -y
|
||||
$ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.3
|
||||
$ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
|
||||
```
|
||||
|
||||
1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
|
||||
|
||||
@ -156,10 +156,3 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring
|
||||
cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
|
||||
|
||||
For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
|
||||
The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
|
||||
|
||||
On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
|
||||
|
||||
@ -24,7 +24,7 @@ vLLM also supports model implementations that are available in Transformers. Thi
|
||||
|
||||
To check if the modeling backend is Transformers, you can simply do this:
|
||||
|
||||
```python
|
||||
```python
|
||||
from vllm import LLM
|
||||
llm = LLM(model=..., task="generate") # Name or path of your model
|
||||
llm.apply_model(lambda model: print(type(model)))
|
||||
@ -55,7 +55,7 @@ If your model is neither supported natively by vLLM or Transformers, you can sti
|
||||
Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
|
||||
Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!
|
||||
|
||||
```python
|
||||
```python
|
||||
from vllm import LLM
|
||||
llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model
|
||||
llm.apply_model(lambda model: print(model.__class__))
|
||||
@ -218,6 +218,11 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `BambaForCausalLM`
|
||||
* Bamba
|
||||
* `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`
|
||||
*
|
||||
*
|
||||
- * `BloomForCausalLM`
|
||||
* BLOOM, BLOOMZ, BLOOMChat
|
||||
* `bigscience/bloom`, `bigscience/bloomz`, etc.
|
||||
@ -228,9 +233,9 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
|
||||
*
|
||||
*
|
||||
- * `ChatGLMModel`
|
||||
- * `ChatGLMModel`, `ChatGLMForConditionalGeneration`
|
||||
* ChatGLM
|
||||
* `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.
|
||||
* `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `CohereForCausalLM`, `Cohere2ForCausalLM`
|
||||
@ -473,6 +478,16 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
- * `Qwen3ForCausalLM`
|
||||
* Qwen3
|
||||
* `Qwen/Qwen3-8B`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Qwen3MoeForCausalLM`
|
||||
* Qwen3MoE
|
||||
* `Qwen/Qwen3-MoE-15B-A2B`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `StableLmForCausalLM`
|
||||
* StableLM
|
||||
* `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
|
||||
@ -503,6 +518,11 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `MiniMaxText01ForCausalLM`
|
||||
* MiniMax-Text
|
||||
* `MiniMaxAI/MiniMax-Text-01`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
- * `Zamba2ForCausalLM`
|
||||
* Zamba2
|
||||
* `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
|
||||
@ -830,6 +850,13 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Llama4ForConditionalGeneration`
|
||||
* Llama-4-17B-Omni-Instruct
|
||||
* T + I<sup>+</sup>
|
||||
* `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `LlavaForConditionalGeneration`
|
||||
* LLaVA-1.5
|
||||
* T + I<sup>E+</sup>
|
||||
@ -878,7 +905,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
*
|
||||
* ✅︎
|
||||
- * `MllamaForConditionalGeneration`
|
||||
* Llama 3.2
|
||||
* T + I<sup>+</sup>
|
||||
@ -1118,5 +1145,5 @@ We have the following levels of testing for models:
|
||||
|
||||
1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
|
||||
2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
|
||||
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:main/examples) for the models that have passed this test.
|
||||
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
|
||||
4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# Usage Stats Collection
|
||||
|
||||
vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit.
|
||||
vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information.
|
||||
|
||||
A subset of the data, after cleaning and aggregation, will be publicly released for the community's benefit. For example, you can see the 2024 usage report [here](https://2024.vllm.ai).
|
||||
|
||||
## What data is collected?
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"audio": audio_count},
|
||||
)
|
||||
|
||||
|
||||
@ -7,89 +7,102 @@ from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default="./examples/data/gsm8k.jsonl",
|
||||
help="downloaded from the eagle repo " \
|
||||
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
|
||||
)
|
||||
parser.add_argument("--max_num_seqs", type=int, default=8)
|
||||
parser.add_argument("--num_prompts", type=int, default=80)
|
||||
parser.add_argument("--num_spec_tokens", type=int, default=2)
|
||||
parser.add_argument("--tp", type=int, default=1)
|
||||
parser.add_argument("--draft_tp", type=int, default=1)
|
||||
parser.add_argument("--enforce_eager", action='store_true')
|
||||
parser.add_argument("--enable_chunked_prefill", action='store_true')
|
||||
parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
|
||||
parser.add_argument("--temp", type=float, default=0)
|
||||
def load_prompts(dataset_path, num_prompts):
|
||||
if os.path.exists(dataset_path):
|
||||
prompts = []
|
||||
try:
|
||||
with open(dataset_path) as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
prompts.append(data["turns"][0])
|
||||
except Exception as e:
|
||||
print(f"Error reading dataset: {e}")
|
||||
return []
|
||||
else:
|
||||
prompts = [
|
||||
"The future of AI is", "The president of the United States is"
|
||||
]
|
||||
|
||||
args = parser.parse_args()
|
||||
return prompts[:num_prompts]
|
||||
|
||||
print(args)
|
||||
|
||||
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default="./examples/data/gsm8k.jsonl",
|
||||
help="downloaded from the eagle repo " \
|
||||
"https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
|
||||
)
|
||||
parser.add_argument("--max_num_seqs", type=int, default=8)
|
||||
parser.add_argument("--num_prompts", type=int, default=80)
|
||||
parser.add_argument("--num_spec_tokens", type=int, default=2)
|
||||
parser.add_argument("--tp", type=int, default=1)
|
||||
parser.add_argument("--draft_tp", type=int, default=1)
|
||||
parser.add_argument("--enforce_eager", action='store_true')
|
||||
parser.add_argument("--enable_chunked_prefill", action='store_true')
|
||||
parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
|
||||
parser.add_argument("--temp", type=float, default=0)
|
||||
args = parser.parse_args()
|
||||
|
||||
max_model_len = 2048
|
||||
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
||||
max_model_len = 2048
|
||||
|
||||
if os.path.exists(args.dataset):
|
||||
prompts = []
|
||||
num_prompts = args.num_prompts
|
||||
with open(args.dataset) as f:
|
||||
for line in f:
|
||||
data = json.loads(line)
|
||||
prompts.append(data["turns"][0])
|
||||
else:
|
||||
prompts = ["The future of AI is", "The president of the United States is"]
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
||||
|
||||
prompts = prompts[:args.num_prompts]
|
||||
num_prompts = len(prompts)
|
||||
prompts = load_prompts(args.dataset, args.num_prompts)
|
||||
|
||||
prompt_ids = [
|
||||
tokenizer.apply_chat_template([{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
add_generation_prompt=True)
|
||||
for prompt in prompts
|
||||
]
|
||||
prompt_ids = [
|
||||
tokenizer.apply_chat_template([{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}],
|
||||
add_generation_prompt=True)
|
||||
for prompt in prompts
|
||||
]
|
||||
|
||||
llm = LLM(
|
||||
model=model_dir,
|
||||
trust_remote_code=True,
|
||||
tensor_parallel_size=args.tp,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
max_num_batched_tokens=args.max_num_batched_tokens,
|
||||
enforce_eager=args.enforce_eager,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config={
|
||||
"model": eagle_dir,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
"draft_tensor_parallel_size": args.draft_tp,
|
||||
"max_model_len": max_model_len,
|
||||
},
|
||||
disable_log_stats=False,
|
||||
)
|
||||
llm = LLM(
|
||||
model=model_dir,
|
||||
trust_remote_code=True,
|
||||
tensor_parallel_size=args.tp,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
max_num_batched_tokens=args.max_num_batched_tokens,
|
||||
enforce_eager=args.enforce_eager,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config={
|
||||
"model": eagle_dir,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
"draft_tensor_parallel_size": args.draft_tp,
|
||||
"max_model_len": max_model_len,
|
||||
},
|
||||
disable_log_stats=False,
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
|
||||
|
||||
outputs = llm.generate(prompt_token_ids=prompt_ids,
|
||||
sampling_params=sampling_params)
|
||||
outputs = llm.generate(prompt_token_ids=prompt_ids,
|
||||
sampling_params=sampling_params)
|
||||
|
||||
# calculate the average number of accepted tokens per forward pass, +1 is
|
||||
# to account for the token from the target model that's always going to be
|
||||
# accepted
|
||||
acceptance_counts = [0] * (args.num_spec_tokens + 1)
|
||||
for output in outputs:
|
||||
for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
|
||||
acceptance_counts[step] += count
|
||||
# calculate the average number of accepted tokens per forward pass, +1 is
|
||||
# to account for the token from the target model that's always going to be
|
||||
# accepted
|
||||
acceptance_counts = [0] * (args.num_spec_tokens + 1)
|
||||
for output in outputs:
|
||||
for step, count in enumerate(
|
||||
output.metrics.spec_token_acceptance_counts):
|
||||
acceptance_counts[step] += count
|
||||
|
||||
print(f"mean acceptance length: \
|
||||
{sum(acceptance_counts) / acceptance_counts[0]:.2f}")
|
||||
print("-" * 50)
|
||||
print(f"mean acceptance length: \
|
||||
{sum(acceptance_counts) / acceptance_counts[0]:.2f}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -75,8 +75,6 @@ prompts = [
|
||||
enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
|
||||
] + zipped_prompt_list
|
||||
|
||||
print(prompts)
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
@ -91,10 +89,13 @@ sampling_params = SamplingParams(
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
print("-" * 50)
|
||||
for i, output in enumerate(outputs):
|
||||
prompt = output.prompt
|
||||
encoder_prompt = output.encoder_prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Encoder prompt: {encoder_prompt!r}, "
|
||||
f"Decoder prompt: {prompt!r}, "
|
||||
print(f"Output {i+1}:")
|
||||
print(f"Encoder prompt: {encoder_prompt!r}\n"
|
||||
f"Decoder prompt: {prompt!r}\n"
|
||||
f"Generated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
"""
|
||||
This file demonstrates using the `LLMEngine`
|
||||
for processing prompts with various sampling parameters.
|
||||
"""
|
||||
import argparse
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
@ -26,6 +29,7 @@ def process_requests(engine: LLMEngine,
|
||||
"""Continuously process a list of prompts and handle the outputs."""
|
||||
request_id = 0
|
||||
|
||||
print('-' * 50)
|
||||
while test_prompts or engine.has_unfinished_requests():
|
||||
if test_prompts:
|
||||
prompt, sampling_params = test_prompts.pop(0)
|
||||
@ -37,6 +41,7 @@ def process_requests(engine: LLMEngine,
|
||||
for request_output in request_outputs:
|
||||
if request_output.finished:
|
||||
print(request_output)
|
||||
print('-' * 50)
|
||||
|
||||
|
||||
def initialize_engine(args: argparse.Namespace) -> LLMEngine:
|
||||
|
||||
93
examples/offline_inference/load_sharded_state.py
Normal file
93
examples/offline_inference/load_sharded_state.py
Normal file
@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
Validates the loading of a model saved with the sharded_state format.
|
||||
This script demonstrates how to load a model that was previously saved
|
||||
using save_sharded_state.py and validates it by running inference.
|
||||
Example usage:
|
||||
(First need to save a sharded_state mode)
|
||||
|
||||
python save_sharded_state.py \
|
||||
--model /path/to/load \
|
||||
--quantization deepspeedfp \
|
||||
--tensor-parallel-size 8 \
|
||||
--output /path/to/save/sharded/modele
|
||||
|
||||
python load_sharded_state.py \
|
||||
--model /path/to/saved/sharded/model \
|
||||
--load-format sharded_state \
|
||||
--quantization deepspeedfp \
|
||||
--tensor-parallel-size 8 \
|
||||
--prompt "Hello, my name is" \
|
||||
--max-tokens 50
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser()
|
||||
# Add engine arguments
|
||||
EngineArgs.add_cli_args(parser)
|
||||
|
||||
# Override default load_format for clarity
|
||||
parser.set_defaults(load_format="sharded_state")
|
||||
|
||||
# Add validation arguments
|
||||
parser.add_argument("--prompt",
|
||||
type=str,
|
||||
default="Hello, world!",
|
||||
help="Prompt for validation")
|
||||
parser.add_argument("--max-tokens",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Maximum number of tokens to generate")
|
||||
parser.add_argument("--temperature",
|
||||
type=float,
|
||||
default=0.7,
|
||||
help="Sampling temperature")
|
||||
parser.add_argument("--top-p",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Top-p sampling parameter")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
engine_args = EngineArgs.from_cli_args(args)
|
||||
|
||||
print(f"Loading model from {engine_args.model} "
|
||||
f"using format {engine_args.load_format}")
|
||||
print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
|
||||
|
||||
# Load the model using engine args
|
||||
llm = LLM(**dataclasses.asdict(engine_args))
|
||||
|
||||
# Prepare sampling parameters
|
||||
sampling_params = SamplingParams(
|
||||
temperature=args.temperature,
|
||||
top_p=args.top_p,
|
||||
max_tokens=args.max_tokens,
|
||||
)
|
||||
|
||||
print("\nRunning inference:")
|
||||
print(f"Prompt: {args.prompt}")
|
||||
|
||||
# Generate completion
|
||||
outputs = llm.generate(args.prompt, sampling_params)
|
||||
|
||||
# Display generated text
|
||||
print("\nGenerated outputs:")
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print("-" * 50)
|
||||
print(f"Full output: {args.prompt}{generated_text}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -13,9 +13,14 @@ from vllm.sampling_params import SamplingParams
|
||||
# - Server:
|
||||
#
|
||||
# ```bash
|
||||
# # Mistral format
|
||||
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
|
||||
# --tokenizer-mode mistral --config-format mistral --load-format mistral \
|
||||
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
||||
#
|
||||
# # HF format
|
||||
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
|
||||
# --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
||||
# ```
|
||||
#
|
||||
# - Client:
|
||||
@ -44,19 +49,22 @@ from vllm.sampling_params import SamplingParams
|
||||
# python demo.py simple
|
||||
# python demo.py advanced
|
||||
|
||||
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
|
||||
# These scripts have been tested on 2x L40 GPUs
|
||||
|
||||
|
||||
def run_simple_demo(args: argparse.Namespace):
|
||||
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
sampling_params = SamplingParams(max_tokens=8192)
|
||||
|
||||
# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
tokenizer_mode="mistral" if args.format == "mistral" else "auto",
|
||||
config_format="mistral" if args.format == "mistral" else "auto",
|
||||
load_format="mistral" if args.format == "mistral" else "auto",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
@ -88,17 +96,18 @@ def run_simple_demo(args: argparse.Namespace):
|
||||
|
||||
def run_advanced_demo(args: argparse.Namespace):
|
||||
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
max_img_per_msg = 5
|
||||
max_img_per_msg = 3
|
||||
max_tokens_per_img = 4096
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
tokenizer_mode="mistral",
|
||||
config_format="mistral",
|
||||
load_format="mistral",
|
||||
tokenizer_mode="mistral" if args.format == "mistral" else "auto",
|
||||
config_format="mistral" if args.format == "mistral" else "auto",
|
||||
load_format="mistral" if args.format == "mistral" else "auto",
|
||||
limit_mm_per_prompt={"image": max_img_per_msg},
|
||||
max_model_len=max_img_per_msg * max_tokens_per_img,
|
||||
tensor_parallel_size=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
@ -166,6 +175,11 @@ def main():
|
||||
help="Specify the demo mode: 'simple' or 'advanced'",
|
||||
)
|
||||
|
||||
parser.add_argument('--format',
|
||||
choices=["mistral", "hf"],
|
||||
default="mistral",
|
||||
help='Specify the format of the model to load.')
|
||||
|
||||
parser.add_argument(
|
||||
'--disable-mm-preprocessor-cache',
|
||||
action='store_true',
|
||||
|
||||
@ -1,4 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the usage of text generation with an LLM model,
|
||||
comparing the performance with and without speculative decoding.
|
||||
|
||||
Note that still not support `v1`:
|
||||
VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
|
||||
"""
|
||||
|
||||
import gc
|
||||
import time
|
||||
@ -7,7 +14,7 @@ from vllm import LLM, SamplingParams
|
||||
|
||||
|
||||
def time_generation(llm: LLM, prompts: list[str],
|
||||
sampling_params: SamplingParams):
|
||||
sampling_params: SamplingParams, title: str):
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput
|
||||
# objects that contain the prompt, generated text, and other information.
|
||||
# Warmup first
|
||||
@ -16,11 +23,15 @@ def time_generation(llm: LLM, prompts: list[str],
|
||||
start = time.time()
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
end = time.time()
|
||||
print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
|
||||
print("-" * 50)
|
||||
print(title)
|
||||
print("time: ",
|
||||
(end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -41,8 +52,7 @@ if __name__ == "__main__":
|
||||
# Create an LLM without spec decoding
|
||||
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")
|
||||
|
||||
print("Without speculation")
|
||||
time_generation(llm, prompts, sampling_params)
|
||||
time_generation(llm, prompts, sampling_params, "Without speculation")
|
||||
|
||||
del llm
|
||||
gc.collect()
|
||||
@ -55,5 +65,4 @@ if __name__ == "__main__":
|
||||
},
|
||||
)
|
||||
|
||||
print("With speculation")
|
||||
time_generation(llm, prompts, sampling_params)
|
||||
time_generation(llm, prompts, sampling_params, "With speculation")
|
||||
|
||||
@ -57,10 +57,25 @@ def main(args):
|
||||
# Prepare output directory
|
||||
Path(args.output).mkdir(exist_ok=True)
|
||||
# Dump worker states to output directory
|
||||
model_executor = llm.llm_engine.model_executor
|
||||
model_executor.save_sharded_state(path=args.output,
|
||||
pattern=args.file_pattern,
|
||||
max_size=args.max_file_size)
|
||||
|
||||
# Check which engine version is being used
|
||||
is_v1_engine = hasattr(llm.llm_engine, "engine_core")
|
||||
|
||||
if is_v1_engine:
|
||||
# For V1 engine, we need to use engine_core.save_sharded_state
|
||||
print("Using V1 engine save path")
|
||||
llm.llm_engine.engine_core.save_sharded_state(
|
||||
path=args.output,
|
||||
pattern=args.file_pattern,
|
||||
max_size=args.max_file_size)
|
||||
else:
|
||||
# For V0 engine
|
||||
print("Using V0 engine save path")
|
||||
model_executor = llm.llm_engine.model_executor
|
||||
model_executor.save_sharded_state(path=args.output,
|
||||
pattern=args.file_pattern,
|
||||
max_size=args.max_file_size)
|
||||
|
||||
# Copy metadata files to output directory
|
||||
for file in os.listdir(model_path):
|
||||
if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"):
|
||||
|
||||
@ -23,10 +23,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Use `distributed_executor_backend="external_launcher"` so that
|
||||
# this llm engine/instance only creates one worker.
|
||||
# it is important to set an explicit seed to make sure that
|
||||
# all ranks have the same random seed, so that sampling can be
|
||||
# deterministic across ranks.
|
||||
llm = LLM(
|
||||
model="facebook/opt-125m",
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend="external_launcher",
|
||||
seed=0,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@ -582,6 +582,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_llama4(questions: list[str], modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
tensor_parallel_size=8,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
gpu_memory_utilization=0.4,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
messages = [[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image"
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": f"{question}"
|
||||
}]
|
||||
}] for question in questions]
|
||||
prompts = tokenizer.apply_chat_template(messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=False)
|
||||
stop_token_ids = None
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=stop_token_ids,
|
||||
)
|
||||
|
||||
|
||||
# Molmo
|
||||
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -907,6 +943,7 @@ model_example_map = {
|
||||
"minicpmv": run_minicpmv,
|
||||
"mistral3": run_mistral3,
|
||||
"mllama": run_mllama,
|
||||
"llama4": run_llama4,
|
||||
"molmo": run_molmo,
|
||||
"NVLM_D": run_nvlm_d,
|
||||
"paligemma": run_paligemma,
|
||||
|
||||
@ -253,6 +253,43 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
tensor_parallel_size=8,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{
|
||||
"type": "text",
|
||||
"text": question
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
@ -567,6 +604,7 @@ model_example_map = {
|
||||
"h2ovl_chat": load_h2ovl,
|
||||
"idefics3": load_idefics3,
|
||||
"internvl_chat": load_internvl,
|
||||
"llama4": load_llama4,
|
||||
"mistral3": load_mistral3,
|
||||
"mllama": load_mllama,
|
||||
"NVLM_D": load_nvlm_d,
|
||||
|
||||
@ -0,0 +1,136 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
To run this example, you can start the vLLM server
|
||||
without any specific flags:
|
||||
|
||||
```bash
|
||||
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
|
||||
--guided-decoding-backend outlines
|
||||
```
|
||||
|
||||
This example demonstrates how to generate chat completions
|
||||
using the OpenAI Python client library.
|
||||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
|
||||
client = OpenAI(
|
||||
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
|
||||
models = client.models.list()
|
||||
model = models.data[0].id
|
||||
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The city to find the weather for"
|
||||
", e.g. 'San Francisco'",
|
||||
},
|
||||
"state": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"the two-letter abbreviation for the state that the "
|
||||
"city is in, e.g. 'CA' which would mean 'California'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_forecast",
|
||||
"description": "Get the weather forecast for a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The city to get the forecast for, e.g. 'New York'",
|
||||
},
|
||||
"state": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The two-letter abbreviation for the state, e.g. 'NY'",
|
||||
},
|
||||
"days": {
|
||||
"type":
|
||||
"integer",
|
||||
"description":
|
||||
"Number of days to get the forecast for (1-7)",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description": "The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "days", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hi! How are you doing today?"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well! How can I help you?"
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Can you tell me what the current weather is in Dallas \
|
||||
and the forecast for the next 5 days, in fahrenheit?",
|
||||
},
|
||||
]
|
||||
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=messages,
|
||||
model=model,
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
stream=True # Enable streaming response
|
||||
)
|
||||
|
||||
for chunk in chat_completion:
|
||||
if chunk.choices and chunk.choices[0].delta.tool_calls:
|
||||
print(chunk.choices[0].delta.tool_calls)
|
||||
|
||||
chat_completion = client.chat.completions.create(messages=messages,
|
||||
model=model,
|
||||
tools=tools,
|
||||
tool_choice="required")
|
||||
|
||||
print(chat_completion.choices[0].message.tool_calls)
|
||||
7
examples/template_florence2.jinja
Normal file
7
examples/template_florence2.jinja
Normal file
@ -0,0 +1,7 @@
|
||||
{%- for message in messages -%}
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{- message['content'] -}}
|
||||
{%- elif message['role'] == 'assistant' -%}
|
||||
{{- message['content'] -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
@ -76,7 +76,7 @@
|
||||
{{- tool_call.name + '(' -}}
|
||||
{%- for param in tool_call.arguments %}
|
||||
{{- param + '=' -}}
|
||||
{{- "%sr" | format(tool_call.arguments[param]) -}}
|
||||
{{- "%s" | format(tool_call.arguments[param]) -}}
|
||||
{% if not loop.last %}, {% endif %}
|
||||
{%- endfor %}
|
||||
{{- ')' -}}
|
||||
|
||||
@ -44,7 +44,7 @@
|
||||
{{- tool_call.name + '(' -}}
|
||||
{%- for param in tool_call.arguments %}
|
||||
{{- param + '=' -}}
|
||||
{{- "%sr" | format(tool_call.arguments[param]) -}}
|
||||
{{- "%s" | format(tool_call.arguments[param]) -}}
|
||||
{% if not loop.last %}, {% endif %}
|
||||
{%- endfor %}
|
||||
{{- ')' -}}
|
||||
|
||||
@ -30,7 +30,7 @@ classifiers = [
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Scientific/Engineering :: Information Analysis",
|
||||
]
|
||||
requires-python = ">=3.9"
|
||||
requires-python = ">=3.9,<3.13"
|
||||
dynamic = [ "version", "dependencies", "optional-dependencies"]
|
||||
|
||||
[project.urls]
|
||||
|
||||
@ -6,7 +6,8 @@ requests >= 2.26.0
|
||||
tqdm
|
||||
blake3
|
||||
py-cpuinfo
|
||||
transformers >= 4.50.3
|
||||
transformers >= 4.51.0
|
||||
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
|
||||
tokenizers >= 0.19.1 # Required for Llama 3.
|
||||
protobuf # Required by LlamaTokenizer.
|
||||
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
|
||||
@ -21,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
|
||||
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.16; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
|
||||
@ -29,8 +29,9 @@ matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.5.4 # required for pixtral test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api]==0.4.4 # required for model evaluation test
|
||||
transformers==4.50.3
|
||||
lm-eval[api]==0.4.8 # required for model evaluation test
|
||||
transformers==4.51.0
|
||||
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
|
||||
# quantization
|
||||
bitsandbytes>=0.45.3
|
||||
buildkite-test-collector==0.1.9
|
||||
|
||||
@ -152,14 +152,17 @@ genson==1.3.0
|
||||
# via datamodel-code-generator
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
hf-xet==0.1.4
|
||||
# via huggingface-hub
|
||||
hiredis==3.0.0
|
||||
# via tensorizer
|
||||
httpcore==1.0.6
|
||||
# via httpx
|
||||
httpx==0.27.2
|
||||
# via -r requirements/test.in
|
||||
huggingface-hub==0.26.2
|
||||
huggingface-hub==0.30.1
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# accelerate
|
||||
# datasets
|
||||
# evaluate
|
||||
@ -219,7 +222,7 @@ librosa==0.10.2.post1
|
||||
# via -r requirements/test.in
|
||||
llvmlite==0.44.0
|
||||
# via numba
|
||||
lm-eval==0.4.4
|
||||
lm-eval==0.4.8
|
||||
# via -r requirements/test.in
|
||||
lxml==5.3.0
|
||||
# via sacrebleu
|
||||
@ -642,7 +645,7 @@ tqdm==4.66.6
|
||||
# transformers
|
||||
tqdm-multiprocess==0.0.11
|
||||
# via lm-eval
|
||||
transformers==4.50.3
|
||||
transformers==4.51.0
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# genai-perf
|
||||
|
||||
@ -17,9 +17,10 @@ ray[data]
|
||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250328-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250328-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250406-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250406-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250406-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250406-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250406-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250406-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
|
||||
|
||||
@ -155,6 +155,24 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
||||
|
||||
llm.wake_up()
|
||||
output2 = llm.generate(prompt, sampling_params)
|
||||
|
||||
# cmp output
|
||||
assert output[0].outputs[0].text == output2[0].outputs[0].text
|
||||
|
||||
llm.sleep(level=1)
|
||||
llm.wake_up(tags=["weights"])
|
||||
|
||||
free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
|
||||
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
|
||||
|
||||
# should just reallocate memory for weights (1B model, ~2GiB weights)
|
||||
if use_v1:
|
||||
assert used_bytes < 10 * GiB_bytes
|
||||
else:
|
||||
assert used_bytes < 6 * GiB_bytes
|
||||
|
||||
# now allocate kv cache memory
|
||||
llm.wake_up(tags=["kv_cache"])
|
||||
output3 = llm.generate(prompt, sampling_params)
|
||||
|
||||
# cmp output
|
||||
assert output[0].outputs[0].text == output3[0].outputs[0].text
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from argparse import ArgumentTypeError
|
||||
from argparse import ArgumentError, ArgumentTypeError
|
||||
|
||||
import pytest
|
||||
|
||||
@ -142,3 +142,39 @@ def test_composite_arg_parser(arg, expected, option):
|
||||
else:
|
||||
args = parser.parse_args([f"--{option}", arg])
|
||||
assert getattr(args, option.replace("-", "_")) == expected
|
||||
|
||||
|
||||
def test_human_readable_model_len():
|
||||
# `exit_on_error` disabled to test invalid values below
|
||||
parser = EngineArgs.add_cli_args(
|
||||
FlexibleArgumentParser(exit_on_error=False))
|
||||
|
||||
args = parser.parse_args([])
|
||||
assert args.max_model_len is None
|
||||
|
||||
args = parser.parse_args(["--max-model-len", "1024"])
|
||||
assert args.max_model_len == 1024
|
||||
|
||||
# Lower
|
||||
args = parser.parse_args(["--max-model-len", "1m"])
|
||||
assert args.max_model_len == 1_000_000
|
||||
args = parser.parse_args(["--max-model-len", "10k"])
|
||||
assert args.max_model_len == 10_000
|
||||
|
||||
# Capital
|
||||
args = parser.parse_args(["--max-model-len", "3K"])
|
||||
assert args.max_model_len == 1024 * 3
|
||||
args = parser.parse_args(["--max-model-len", "10M"])
|
||||
assert args.max_model_len == 2**20 * 10
|
||||
|
||||
# Decimal values
|
||||
args = parser.parse_args(["--max-model-len", "10.2k"])
|
||||
assert args.max_model_len == 10200
|
||||
# ..truncated to the nearest int
|
||||
args = parser.parse_args(["--max-model-len", "10.212345k"])
|
||||
assert args.max_model_len == 10212
|
||||
|
||||
# Invalid (do not allow decimals with binary multipliers)
|
||||
for invalid in ["1a", "pwd", "10.24", "1.23M"]:
|
||||
with pytest.raises(ArgumentError):
|
||||
args = parser.parse_args(["--max-model-len", invalid])
|
||||
|
||||
@ -13,18 +13,24 @@ import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
|
||||
MODEL_NAMES = [
|
||||
"Qwen/Qwen2-1.5B-Instruct",
|
||||
"google/gemma-3-1b-it",
|
||||
]
|
||||
NUM_CONCURRENT = 500
|
||||
TASK = "gsm8k"
|
||||
FILTER = "exact_match,strict-match"
|
||||
RTOL = 0.03
|
||||
EXPECTED_VALUE = 0.58
|
||||
EXPECTED_VALUES = {
|
||||
"Qwen/Qwen2-1.5B-Instruct": 0.58,
|
||||
"google/gemma-3-1b-it": 0.25,
|
||||
}
|
||||
|
||||
|
||||
def run_test(more_args=None):
|
||||
def run_test(model_name, more_args=None):
|
||||
"""Run the end to end accuracy test."""
|
||||
|
||||
model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
|
||||
model_args = f"pretrained={model_name},max_model_len=4096"
|
||||
|
||||
if more_args is not None:
|
||||
model_args = "{},{}".format(model_args, more_args)
|
||||
@ -37,9 +43,12 @@ def run_test(more_args=None):
|
||||
)
|
||||
|
||||
measured_value = results["results"][TASK][FILTER]
|
||||
assert (measured_value - RTOL < EXPECTED_VALUE
|
||||
and measured_value + RTOL > EXPECTED_VALUE
|
||||
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
|
||||
assert model_name in EXPECTED_VALUES, (
|
||||
f"Cannot find the expected value for the model {model_name=}")
|
||||
expected_value = EXPECTED_VALUES[model_name]
|
||||
assert (measured_value - RTOL < expected_value
|
||||
and measured_value + RTOL > expected_value
|
||||
), f"Expected: {expected_value} | Measured: {measured_value}"
|
||||
|
||||
|
||||
# TODO: [AlexM] Fix it with new CI/CD tests
|
||||
@ -49,7 +58,8 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
|
||||
@pytest.mark.skipif(not current_platform.is_cuda()
|
||||
and not current_platform.is_tpu(),
|
||||
reason="V1 is currently only supported on CUDA and TPU")
|
||||
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
@pytest.mark.parametrize("model", MODEL_NAMES)
|
||||
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
|
||||
"""Run with the V1 Engine."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
@ -64,7 +74,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
if TPU_TP_TEST_STR:
|
||||
more_args += ",{}".format(TPU_TP_TEST_STR)
|
||||
|
||||
run_test(more_args)
|
||||
run_test(model, more_args)
|
||||
|
||||
|
||||
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
@ -72,4 +82,4 @@ def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
run_test()
|
||||
run_test("Qwen/Qwen2-1.5B-Instruct")
|
||||
|
||||
@ -11,7 +11,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
import requests
|
||||
import torch
|
||||
from openai import BadRequestError
|
||||
from openai import BadRequestError, OpenAI
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
|
||||
@ -786,56 +786,135 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
|
||||
sample_json_schema):
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_required_tool_use(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool, model_name: str):
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported on V1")
|
||||
pytest.skip(
|
||||
"tool_choice='required' requires features unsupported on V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
f"Give an example JSON for an employee profile that "
|
||||
f"fits this schema: {sample_json_schema}"
|
||||
}]
|
||||
tools = [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description":
|
||||
"The city to find the weather for, e.g. 'Vienna'",
|
||||
"default": "Vienna",
|
||||
},
|
||||
"country": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The country that the city is in, e.g. 'Austria'",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description":
|
||||
"The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["country", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_forecast",
|
||||
"description": "Get the weather forecast for a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string",
|
||||
"description":
|
||||
"The city to get the forecast for, e.g. 'Vienna'",
|
||||
"default": "Vienna",
|
||||
},
|
||||
"country": {
|
||||
"type":
|
||||
"string",
|
||||
"description":
|
||||
"The country that the city is in, e.g. 'Austria'",
|
||||
},
|
||||
"days": {
|
||||
"type":
|
||||
"integer",
|
||||
"description":
|
||||
"Number of days to get the forecast for (1-7)",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"description":
|
||||
"The unit to fetch the temperature in",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["country", "days", "unit"],
|
||||
},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice="required")
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hi! How are you doing today?"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "I'm doing well! How can I help you?"
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"Can you tell me what the current weather is in Berlin and the "\
|
||||
"forecast for the next 5 days, in fahrenheit?",
|
||||
},
|
||||
]
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
tools=[{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "dummy_function_name",
|
||||
"description": "This is a dummy function",
|
||||
"parameters": sample_json_schema
|
||||
}
|
||||
}],
|
||||
tool_choice="auto")
|
||||
# Non-streaming test
|
||||
chat_completion = await client.chat.completions.create(
|
||||
messages=messages,
|
||||
model=model_name,
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
extra_body=dict(guided_decoding_backend="outlines"),
|
||||
)
|
||||
|
||||
assert chat_completion.choices[0].message.tool_calls is not None
|
||||
assert len(chat_completion.choices[0].message.tool_calls) > 0
|
||||
|
||||
# Streaming test
|
||||
stream = await client.chat.completions.create(
|
||||
messages=messages,
|
||||
model=model_name,
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
extra_body=dict(guided_decoding_backend="outlines"),
|
||||
stream=True,
|
||||
)
|
||||
|
||||
output = []
|
||||
async for chunk in stream:
|
||||
if chunk.choices and chunk.choices[0].delta.tool_calls:
|
||||
output.extend(chunk.choices[0].delta.tool_calls)
|
||||
|
||||
assert len(output) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
@ -1054,7 +1133,7 @@ async def test_long_seed(client: openai.AsyncOpenAI):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_http_chat_wo_model_name(server: RemoteOpenAIServer):
|
||||
async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
|
||||
url = f"http://localhost:{server.port}/v1/chat/completions"
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
@ -1075,10 +1154,35 @@ async def test_http_chat_wo_model_name(server: RemoteOpenAIServer):
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
response_data = response.json()
|
||||
print(response_data)
|
||||
|
||||
assert response_data.get("model") == MODEL_NAME
|
||||
choice = response_data.get("choices")[0]
|
||||
message = choice.get("message")
|
||||
assert message is not None
|
||||
content = message.get("content")
|
||||
assert content is not None
|
||||
assert len(content) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME, ""])
|
||||
async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer,
|
||||
model_name: str):
|
||||
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = f"http://localhost:{server.port}/v1"
|
||||
|
||||
client = OpenAI(
|
||||
api_key=openai_api_key,
|
||||
base_url=openai_api_base,
|
||||
)
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "Hello, vLLM!"
|
||||
},
|
||||
]
|
||||
response = client.chat.completions.create(
|
||||
model="", # empty string
|
||||
messages=messages,
|
||||
)
|
||||
assert response.model == MODEL_NAME
|
||||
|
||||
@ -13,9 +13,12 @@ import requests
|
||||
from prometheus_client.parser import text_string_to_metric_families
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import version
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
||||
PREV_MINOR_VERSION = version._prev_minor_version()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[True, False])
|
||||
@ -55,6 +58,7 @@ def default_server_args():
|
||||
"",
|
||||
"--enable-chunked-prefill",
|
||||
"--disable-frontend-multiprocessing",
|
||||
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
|
||||
])
|
||||
def server(use_v1, default_server_args, request):
|
||||
if request.param:
|
||||
@ -129,7 +133,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
|
||||
|
||||
# Loop over all expected metric_families
|
||||
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
|
||||
if use_v1 and metric_family not in EXPECTED_METRICS_V1:
|
||||
if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
|
||||
or (not server.show_hidden_metrics
|
||||
and metric_family in HIDDEN_DEPRECATED_METRICS)):
|
||||
continue
|
||||
|
||||
found_metric = False
|
||||
@ -165,10 +171,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
|
||||
|
||||
EXPECTED_METRICS = [
|
||||
"vllm:num_requests_running",
|
||||
"vllm:num_requests_swapped",
|
||||
"vllm:num_requests_swapped", # deprecated
|
||||
"vllm:num_requests_waiting",
|
||||
"vllm:gpu_cache_usage_perc",
|
||||
"vllm:cpu_cache_usage_perc",
|
||||
"vllm:cpu_cache_usage_perc", # deprecated
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
@ -268,6 +274,11 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:request_decode_time_seconds_count",
|
||||
]
|
||||
|
||||
HIDDEN_DEPRECATED_METRICS = [
|
||||
"vllm:num_requests_swapped",
|
||||
"vllm:cpu_cache_usage_perc",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_metrics_exist(server: RemoteOpenAIServer,
|
||||
@ -282,7 +293,9 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
|
||||
for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
|
||||
assert metric in response.text
|
||||
if (not server.show_hidden_metrics
|
||||
and metric not in HIDDEN_DEPRECATED_METRICS):
|
||||
assert metric in response.text
|
||||
|
||||
|
||||
def test_metrics_exist_run_batch(use_v1: bool):
|
||||
|
||||
@ -25,16 +25,37 @@ def test_sleep_mode():
|
||||
"VLLM_SERVER_DEV_MODE": "1",
|
||||
"CUDA_VISIBLE_DEVICES": "0"
|
||||
}) as remote_server:
|
||||
|
||||
response = requests.post(remote_server.url_for("/sleep"),
|
||||
response = requests.post(remote_server.url_for("sleep"),
|
||||
params={"level": "1"})
|
||||
assert response.status_code == 200
|
||||
response = requests.get(remote_server.url_for("/is_sleeping"))
|
||||
response = requests.get(remote_server.url_for("is_sleeping"))
|
||||
assert response.status_code == 200
|
||||
assert response.json().get("is_sleeping") is True
|
||||
|
||||
response = requests.post(remote_server.url_for("/wake_up"))
|
||||
response = requests.post(remote_server.url_for("wake_up"))
|
||||
assert response.status_code == 200
|
||||
response = requests.get(remote_server.url_for("/is_sleeping"))
|
||||
response = requests.get(remote_server.url_for("is_sleeping"))
|
||||
assert response.status_code == 200
|
||||
assert response.json().get("is_sleeping") is False
|
||||
|
||||
# test wake up with tags
|
||||
response = requests.post(remote_server.url_for("sleep"),
|
||||
params={"level": "1"})
|
||||
assert response.status_code == 200
|
||||
|
||||
response = requests.post(remote_server.url_for("wake_up"),
|
||||
params={"tags": ["weights"]})
|
||||
assert response.status_code == 200
|
||||
|
||||
# is sleeping should be false after waking up any part of the engine
|
||||
response = requests.get(remote_server.url_for("is_sleeping"))
|
||||
assert response.status_code == 200
|
||||
assert response.json().get("is_sleeping") is True
|
||||
|
||||
response = requests.post(remote_server.url_for("wake_up"),
|
||||
params={"tags": ["kv_cache"]})
|
||||
assert response.status_code == 200
|
||||
|
||||
response = requests.get(remote_server.url_for("is_sleeping"))
|
||||
assert response.status_code == 200
|
||||
assert response.json().get("is_sleeping") is False
|
||||
|
||||
@ -9,8 +9,11 @@ import torch
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
deep_gemm_moe_fp8, fused_topk, moe_align_block_size)
|
||||
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
|
||||
deep_gemm_moe_fp8)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
|
||||
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
||||
moe_align_block_size)
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8, w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
@ -357,7 +360,7 @@ def fp8_perm(m, idx):
|
||||
return m[idx, ...]
|
||||
|
||||
|
||||
def test_moe_permute(a, a_s, topk_ids, num_groups, topk, block_m):
|
||||
def _moe_permute(a, a_s, topk_ids, num_groups, topk, block_m):
|
||||
M, K = a.shape
|
||||
|
||||
sorted_token_ids, m_indices, num_pad = moe_align_block_size(
|
||||
@ -376,7 +379,7 @@ def test_moe_permute(a, a_s, topk_ids, num_groups, topk, block_m):
|
||||
return a, a_s, m_indices, inv_perm
|
||||
|
||||
|
||||
def test_moe_unpermute(out, inv_perm, topk, K, topk_weight):
|
||||
def _moe_unpermute(out, inv_perm, topk, K, topk_weight):
|
||||
M = topk_weight.shape[0]
|
||||
out = out[inv_perm, ...]
|
||||
tmp_out = out.view(-1, topk, K)
|
||||
@ -398,8 +401,8 @@ def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk,
|
||||
|
||||
a_q, a_s = per_token_group_quant_fp8(a, block_m)
|
||||
|
||||
a_q, a_s, m_indices, inv_perm = test_moe_permute(a_q, a_s, topk_ids,
|
||||
num_groups, topk, block_m)
|
||||
a_q, a_s, m_indices, inv_perm = _moe_permute(a_q, a_s, topk_ids,
|
||||
num_groups, topk, block_m)
|
||||
|
||||
inter_out = torch.zeros((a_q.shape[0], N * 2),
|
||||
dtype=torch.bfloat16,
|
||||
@ -416,7 +419,7 @@ def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk,
|
||||
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
|
||||
(act_out_q, act_out_s), (w2, w2_s), out, m_indices)
|
||||
|
||||
final_out = test_moe_unpermute(out, inv_perm, topk, K, topk_weight)
|
||||
final_out = _moe_unpermute(out, inv_perm, topk, K, topk_weight)
|
||||
|
||||
return final_out
|
||||
|
||||
@ -437,7 +440,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed):
|
||||
pytest.skip(
|
||||
f"Skipping test; bad size m={M}, n={N}, k={K}, topk={topk}, E={E}")
|
||||
|
||||
if (N <= 512):
|
||||
if N <= 512:
|
||||
pytest.skip("Skipping N <= 512 until performance issues solved.")
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
|
||||
@ -4,8 +4,8 @@ import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8,
|
||||
fused_experts,
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
|
||||
fused_topk)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@ -131,9 +131,9 @@ def test_cutlass_moe_no_graph(
|
||||
c_strides2,
|
||||
a1_scale=a_scale1)
|
||||
|
||||
print(triton_output)
|
||||
print(cutlass_output)
|
||||
print("*")
|
||||
#print(triton_output)
|
||||
#print(cutlass_output)
|
||||
#print("*")
|
||||
|
||||
torch.testing.assert_close(triton_output,
|
||||
cutlass_output,
|
||||
@ -234,9 +234,9 @@ def test_cutlass_moe_cuda_graph(
|
||||
graph.replay()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
print(triton_output)
|
||||
print(cutlass_output)
|
||||
print("*")
|
||||
#print(triton_output)
|
||||
#print(cutlass_output)
|
||||
#print("*")
|
||||
|
||||
torch.testing.assert_close(triton_output,
|
||||
cutlass_output,
|
||||
|
||||
@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type):
|
||||
qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8)
|
||||
m = qweight.shape[0]
|
||||
n = qweight.shape[1] // type_size * block_size
|
||||
opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n))
|
||||
opcheck(torch.ops._C.ggml_dequantize,
|
||||
(qweight, quant_type, m, n, torch.float16))
|
||||
|
||||
x = torch.rand((m, 512), device='cuda', dtype=torch.float16)
|
||||
opcheck(torch.ops._C.ggml_mul_mat_a8,
|
||||
|
||||
@ -65,7 +65,7 @@ QUANT_TYPES = [
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
|
||||
@pytest.mark.parametrize("dtype", [torch.half])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
|
||||
@torch.inference_mode()
|
||||
def test_dequantize(hidden_size: int, dtype: torch.dtype,
|
||||
@ -78,7 +78,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
|
||||
ref_output = torch.tensor(dequantize(tensor.data, quant_type),
|
||||
device="cuda").to(dtype)
|
||||
output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"),
|
||||
quant_type, *list(shape)).to(dtype)
|
||||
quant_type, *list(shape), dtype)
|
||||
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)
|
||||
|
||||
|
||||
286
tests/kernels/test_lightning_attn.py
Normal file
286
tests/kernels/test_lightning_attn.py
Normal file
@ -0,0 +1,286 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.lightning_attn import (
|
||||
linear_decode_forward_triton)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
NUM_HEADS = [4, 8]
|
||||
HEAD_SIZES = [64]
|
||||
BATCH_SIZES = [1, 2]
|
||||
SEQ_LENGTHS = [16]
|
||||
DTYPES = [torch.float32]
|
||||
|
||||
|
||||
def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
|
||||
"""Reference implementation of lightning attention core algorithm
|
||||
|
||||
The difference from the main implementation is that this processes
|
||||
each step sequentially, instead of using parallelized triton kernels
|
||||
"""
|
||||
B, H, S, D = q.shape
|
||||
E = v.shape[-1]
|
||||
dtype = q.dtype
|
||||
output = torch.zeros((B, H, S, E), dtype=dtype, device=q.device)
|
||||
|
||||
# Use clone() to ensure an independent copy
|
||||
if kv_history is None:
|
||||
kv_cache = torch.zeros((B, H, D, E), dtype=dtype, device=q.device)
|
||||
else:
|
||||
kv_cache = kv_history.clone()
|
||||
|
||||
# More efficient implementation
|
||||
# Convert decay factors to matrix form
|
||||
if ed.dim() == 1:
|
||||
decay = torch.exp(-ed).view(1, -1, 1, 1)
|
||||
else:
|
||||
decay = torch.exp(-ed)
|
||||
|
||||
for b in range(B):
|
||||
for step in range(S):
|
||||
# Process all heads at once for this position
|
||||
q_bs = q[b, :, step] # [H, D]
|
||||
k_bs = k[b, :, step] # [H, D]
|
||||
v_bs = v[b, :, step] # [H, E]
|
||||
|
||||
# Calculate KV outer products for all heads
|
||||
for h in range(H):
|
||||
# Calculate KV outer product
|
||||
kv_outer = torch.outer(k_bs[h], v_bs[h])
|
||||
|
||||
# Update KV cache with decay
|
||||
# Note: Using the same order as in the Triton kernel
|
||||
kv_cache[b, h] = decay[0, h, 0, 0] * kv_cache[b, h] + kv_outer
|
||||
|
||||
# Calculate attention output
|
||||
output[b, h, step] = torch.matmul(q_bs[h], kv_cache[b, h])
|
||||
|
||||
# Match the shape returned by the actual implementation
|
||||
# The actual implementation returns a tensor of shape [B, H, 2, D, E]
|
||||
# where dimension 2 contains both KV and KV history
|
||||
kv_reshaped = kv_cache.unsqueeze(2) # [B, H, 1, D, E]
|
||||
final_kv_cache = torch.cat([kv_reshaped, kv_reshaped],
|
||||
dim=2) # [B, H, 2, D, E]
|
||||
|
||||
return output, final_kv_cache
|
||||
|
||||
|
||||
def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx):
|
||||
"""Reference implementation: linear attention decode function"""
|
||||
B, H, _, D = q.shape
|
||||
output = torch.zeros(B, H * D, dtype=q.dtype, device=q.device)
|
||||
|
||||
# Calculate decay factors once (more efficient)
|
||||
decay = torch.exp(-slope_rate).view(-1, 1, 1) # [H, 1, 1]
|
||||
|
||||
# Process each batch
|
||||
for b in range(B):
|
||||
slot_id = slot_idx[b].item()
|
||||
|
||||
# Skip padding positions
|
||||
if slot_id == -1:
|
||||
continue
|
||||
|
||||
# Process all heads at once for this batch
|
||||
q_b = q[b, :, 0] # [H, D]
|
||||
k_b = k[b, :, 0] # [H, D]
|
||||
v_b = v[b, :, 0] # [H, D]
|
||||
|
||||
# Process each attention head
|
||||
for h in range(H):
|
||||
# Get current query, key and value
|
||||
q_bh = q_b[h]
|
||||
k_bh = k_b[h]
|
||||
v_bh = v_b[h]
|
||||
|
||||
# Get cache
|
||||
kv_cache_old = kv_caches[b, h]
|
||||
|
||||
# Calculate new key-value outer product
|
||||
kv_outer = torch.outer(k_bh, v_bh)
|
||||
|
||||
# Apply decay and update cache
|
||||
kv_new = kv_outer + decay[h, 0, 0] * kv_cache_old
|
||||
|
||||
# Calculate output
|
||||
out_h = torch.matmul(q_bh, kv_new)
|
||||
|
||||
# Update output and cache
|
||||
output[b, h * D:(h + 1) * D] = out_h
|
||||
kv_caches[b, h] = kv_new
|
||||
|
||||
return output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_linear_decode_forward_triton(
|
||||
batch_size: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
torch.set_default_device("cuda")
|
||||
torch.manual_seed(42)
|
||||
torch.cuda.manual_seed_all(42)
|
||||
current_platform.seed_everything(42)
|
||||
base = 0.01
|
||||
q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
|
||||
kv_caches = base * torch.randn(batch_size,
|
||||
num_heads,
|
||||
head_size,
|
||||
head_size,
|
||||
dtype=dtype,
|
||||
device="cuda")
|
||||
|
||||
kv_caches_copy = kv_caches.clone()
|
||||
|
||||
slope_rate = torch.zeros(num_heads, device="cuda")
|
||||
for h in range(num_heads):
|
||||
slope_rate[h] = 0.1 * (h + 1)
|
||||
|
||||
slot_idx = torch.arange(batch_size, device="cuda")
|
||||
|
||||
triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
|
||||
slope_rate, slot_idx)
|
||||
|
||||
reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
|
||||
slope_rate, slot_idx)
|
||||
torch.testing.assert_close(triton_output,
|
||||
reference_output,
|
||||
rtol=1e-1,
|
||||
atol=1e-1)
|
||||
torch.testing.assert_close(kv_caches, kv_caches_copy, rtol=1e-1, atol=1e-1)
|
||||
|
||||
assert triton_output.shape == (batch_size, num_heads * head_size)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_linear_decode_forward_triton_with_padding(
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
torch.set_default_device("cuda")
|
||||
torch.manual_seed(42)
|
||||
torch.cuda.manual_seed_all(42)
|
||||
current_platform.seed_everything(42)
|
||||
|
||||
batch_size = 4
|
||||
base = 0.01
|
||||
q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
v = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
|
||||
kv_caches = base * torch.randn(batch_size,
|
||||
num_heads,
|
||||
head_size,
|
||||
head_size,
|
||||
dtype=dtype,
|
||||
device="cuda")
|
||||
|
||||
kv_caches_copy = kv_caches.clone()
|
||||
|
||||
slope_rate = torch.zeros(num_heads, device="cuda")
|
||||
for h in range(num_heads):
|
||||
slope_rate[h] = 0.1 * (h + 1)
|
||||
|
||||
slot_idx = torch.tensor([0, 1, -1, 2], device="cuda")
|
||||
|
||||
triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
|
||||
slope_rate, slot_idx)
|
||||
|
||||
reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
|
||||
slope_rate, slot_idx)
|
||||
|
||||
padding_mask = (slot_idx
|
||||
!= -1).unsqueeze(1).expand(-1, num_heads * head_size)
|
||||
|
||||
triton_masked = triton_output[padding_mask]
|
||||
reference_masked = reference_output[padding_mask]
|
||||
|
||||
atol, rtol = 1.5e-1, 1.5e-1
|
||||
|
||||
valid_indices = slot_idx != -1
|
||||
|
||||
for i in range(batch_size):
|
||||
if valid_indices[i] > 0:
|
||||
torch.testing.assert_close(kv_caches[i],
|
||||
kv_caches_copy[i],
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
|
||||
torch.testing.assert_close(triton_masked,
|
||||
reference_masked,
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
|
||||
assert triton_output.shape == (batch_size, num_heads * head_size)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENGTHS)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_lightning_attention_reference(
|
||||
batch_size: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
seq_len: int,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
torch.set_default_device("cuda")
|
||||
torch.manual_seed(42)
|
||||
torch.cuda.manual_seed_all(42)
|
||||
current_platform.seed_everything(42)
|
||||
|
||||
base = 0.01
|
||||
q = base * torch.randn(
|
||||
batch_size, num_heads, seq_len, head_size, dtype=dtype)
|
||||
k = base * torch.randn(
|
||||
batch_size, num_heads, seq_len, head_size, dtype=dtype)
|
||||
v = base * torch.randn(
|
||||
batch_size, num_heads, seq_len, head_size, dtype=dtype)
|
||||
|
||||
ed = torch.zeros(num_heads, device="cuda")
|
||||
for h in range(num_heads):
|
||||
ed[h] = 0.1 * (h + 1)
|
||||
|
||||
kv_history = base * torch.randn(batch_size,
|
||||
num_heads,
|
||||
head_size,
|
||||
head_size,
|
||||
dtype=dtype,
|
||||
device="cuda")
|
||||
|
||||
kv_history_clone = kv_history.clone()
|
||||
|
||||
ref_output, ref_kv_cache = reference_lightning_attention(
|
||||
q, k, v, ed, 256, kv_history)
|
||||
|
||||
from vllm.model_executor.layers.lightning_attn import lightning_attention
|
||||
actual_output, actual_kv_cache = lightning_attention(
|
||||
q, k, v, ed, 256, kv_history_clone)
|
||||
|
||||
atol, rtol = 1.5e-1, 1.5e-1
|
||||
torch.testing.assert_close(ref_output, actual_output, rtol=rtol, atol=atol)
|
||||
torch.testing.assert_close(ref_kv_cache,
|
||||
actual_kv_cache,
|
||||
rtol=rtol,
|
||||
atol=atol)
|
||||
|
||||
assert ref_output.shape == (batch_size, num_heads, seq_len, head_size)
|
||||
assert ref_kv_cache.shape == actual_kv_cache.shape
|
||||
@ -164,6 +164,7 @@ def test_contexted_kv_attention(
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
MAX_CTX_LEN,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
@ -180,6 +181,7 @@ def test_contexted_kv_attention(
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
MAX_CTX_LEN,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
@ -397,6 +399,7 @@ def test_contexted_kv_attention_alibi(
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
MAX_CTX_LEN,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
@ -413,6 +416,7 @@ def test_contexted_kv_attention_alibi(
|
||||
block_table,
|
||||
b_start_loc,
|
||||
b_seq_len,
|
||||
MAX_CTX_LEN,
|
||||
max_input_len,
|
||||
k_scale,
|
||||
v_scale,
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from typing import TypedDict
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
@ -26,28 +25,6 @@ from vllm.model_executor.models.interfaces import SupportsLoRA
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
class ContextIDInfo(TypedDict):
|
||||
lora_id: int
|
||||
context_length: str
|
||||
|
||||
|
||||
class ContextInfo(TypedDict):
|
||||
lora: str
|
||||
context_length: str
|
||||
|
||||
|
||||
LONG_LORA_INFOS: list[ContextIDInfo] = [{
|
||||
"lora_id": 1,
|
||||
"context_length": "16k",
|
||||
}, {
|
||||
"lora_id": 2,
|
||||
"context_length": "16k",
|
||||
}, {
|
||||
"lora_id": 3,
|
||||
"context_length": "32k",
|
||||
}]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def should_do_global_cleanup_after_test(request) -> bool:
|
||||
"""Allow subdirectories to skip global cleanup by overriding this fixture.
|
||||
|
||||
@ -59,7 +59,7 @@ DEVICES = ([
|
||||
# prefill stage(True) or decode stage(False)
|
||||
STAGES = [True, False]
|
||||
|
||||
NUM_RANDOM_SEEDS = 10
|
||||
NUM_RANDOM_SEEDS = 6
|
||||
|
||||
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
|
||||
|
||||
|
||||
@ -153,20 +153,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=4,
|
||||
fully_sharded_loras=True,
|
||||
enable_lora_bias=True,
|
||||
enable_chunked_prefill=True,
|
||||
)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
|
||||
@ -58,7 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@create_new_process_for_each_test()
|
||||
def test_minicpmv_lora(minicpmv_lora_files):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
|
||||
@ -1,7 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
@ -44,7 +47,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
return generated_texts
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
@ -63,6 +65,8 @@ def test_ilama_lora(ilama_lora_files):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4(ilama_lora_files):
|
||||
@ -84,6 +88,8 @@ def test_ilama_lora_tp4(ilama_lora_files):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
||||
|
||||
@ -12,6 +12,7 @@ from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
@ -55,7 +56,10 @@ def server(request, audio_assets):
|
||||
for key, value in request.param.items()
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
args,
|
||||
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||
"30"}) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@ -106,6 +110,10 @@ def run_test(
|
||||
**kwargs,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm."""
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
@ -156,6 +164,10 @@ def run_multi_audio_test(
|
||||
num_logprobs: int,
|
||||
**kwargs,
|
||||
):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
|
||||
@ -160,17 +160,32 @@ VLM_TEST_SETTINGS = {
|
||||
),
|
||||
"aya_vision": VLMTestInfo(
|
||||
models=["CohereForAI/aya-vision-8b"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
test_type=(VLMTestType.IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
}),
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=8192,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
|
||||
),
|
||||
"aya_vision-multi_image": VLMTestInfo(
|
||||
models=["CohereForAI/aya-vision-8b"],
|
||||
test_type=(VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
}),
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"blip2": VLMTestInfo(
|
||||
# TODO: Change back to 2.7b once head_dim = 80 is supported
|
||||
@ -303,6 +318,22 @@ VLM_TEST_SETTINGS = {
|
||||
use_tokenizer_eos=True,
|
||||
patch_hf_runner=model_utils.internvl_patch_hf_runner,
|
||||
),
|
||||
"llama4": VLMTestInfo(
|
||||
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
|
||||
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda _: "<|image|>",
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
distributed_executor_backend="mp",
|
||||
image_size_factors=[(.25, 0.5, 1.0)],
|
||||
hf_model_kwargs={"device_map": "auto"},
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
dtype="bfloat16",
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
tensor_parallel_size=8,
|
||||
vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
|
||||
marks=multi_gpu_marks(num_gpus=8),
|
||||
),
|
||||
"llava_next": VLMTestInfo(
|
||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
|
||||
|
||||
@ -5,7 +5,9 @@ import re
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import AutoTokenizer
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
@ -81,6 +83,13 @@ def run_test(
|
||||
from transformers import AutoImageProcessor # noqa: F401
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
|
||||
# Once the model repo is updated to 4.49, we should be able to run the
|
||||
# test in `test_models.py` without the above workaround
|
||||
if Version(TRANSFORMERS_VERSION) >= Version("4.49"):
|
||||
pytest.skip(f"`transformers=={TRANSFORMERS_VERSION}` installed, "
|
||||
"but `transformers<=4.49` is required to run this model. "
|
||||
"Reason: Cannot run HF implementation")
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
|
||||
@ -176,6 +176,8 @@ def test_chat(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
|
||||
@ -257,6 +257,7 @@ def _test_processing_correctness_mistral(
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"OpenGVLab/InternVL2-1B",
|
||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
|
||||
99
tests/models/multimodal/processing/test_llama4.py
Normal file
99
tests/models/multimodal/processing/test_llama4.py
Normal file
@ -0,0 +1,99 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Tests for Llama4's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.transformers_utils.tokenizer import encode_tokens
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
|
||||
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 5])
|
||||
@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
|
||||
@pytest.mark.parametrize("tokenized_prompt", [True, False])
|
||||
def test_processor_override(
|
||||
image_assets: _ImageAssets,
|
||||
model_id: str,
|
||||
mm_processor_kwargs: dict,
|
||||
num_imgs: int,
|
||||
disable_mm_preprocessor_cache: bool,
|
||||
tokenized_prompt: bool,
|
||||
):
|
||||
"""Ensure llama4 processor works properly."""
|
||||
ctx = build_model_context(
|
||||
model_id,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
config = processor.info.get_hf_config()
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
hf_processor = processor.info.get_hf_processor()
|
||||
vocab = tokenizer.get_vocab()
|
||||
|
||||
prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
|
||||
+ "<|image|>" * num_imgs \
|
||||
+ "<|eot|><|header_start|>assistant<|header_end|>"
|
||||
mm_data = {
|
||||
"image": [
|
||||
image_assets[(i % len(image_assets))].pil_image
|
||||
for i in range(num_imgs)
|
||||
]
|
||||
}
|
||||
if tokenized_prompt:
|
||||
prompt = encode_tokens(tokenizer, prompt)
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
mm_kwargs = processed_inputs["mm_kwargs"]
|
||||
|
||||
# place holder replacements
|
||||
prompt_token_ids = processed_inputs["prompt_token_ids"]
|
||||
assert prompt_token_ids.count(config.boi_token_index) == num_imgs
|
||||
assert prompt_token_ids.count(config.eoi_token_index) == num_imgs
|
||||
assert prompt_token_ids.count(vocab[hf_processor.image_token]) == num_imgs
|
||||
aspect_ratios = mm_kwargs["aspect_ratios"]
|
||||
num_x_separators = num_y_separators = 0
|
||||
for tiles_y, tiles_x in aspect_ratios:
|
||||
if tiles_x * tiles_y > 1:
|
||||
num_x_separators += (tiles_x - 1) * tiles_y
|
||||
num_y_separators += tiles_y
|
||||
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
|
||||
== num_x_separators
|
||||
assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
|
||||
== num_y_separators
|
||||
|
||||
# image token offsets
|
||||
img_locs = processed_inputs["mm_placeholders"].get("image", [])
|
||||
assert len(img_locs) == num_imgs
|
||||
assert [img_loc["offset"] for img_loc in img_locs] == \
|
||||
[i for i, v in enumerate(prompt_token_ids) \
|
||||
if v == config.boi_token_index]
|
||||
|
||||
# patch sizes and masks
|
||||
assert prompt_token_ids.count(config.image_token_index) \
|
||||
== sum(img_patch.sum() for img_patch in mm_kwargs["embed_is_patch"])
|
||||
patch_token_id = vocab[hf_processor.img_patch_token]
|
||||
num_patches = processed_inputs["prompt_token_ids"].count(patch_token_id)
|
||||
mm_counts = {"image": num_imgs}
|
||||
assert num_patches / num_imgs <= \
|
||||
processor.info.get_mm_max_tokens_per_item(32768, mm_counts)["image"]
|
||||
num_patches_per_chunk = processor.info.get_patch_per_chunk(
|
||||
config.vision_config)
|
||||
assert prompt_token_ids.count(config.image_token_index) \
|
||||
== mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk
|
||||
assert mm_kwargs["pixel_values"].shape[0] \
|
||||
== mm_kwargs["patches_per_image"].sum()
|
||||
|
||||
for embed_is_patch, aspect_ratio in zip(mm_kwargs["embed_is_patch"],
|
||||
mm_kwargs["aspect_ratios"]):
|
||||
assert embed_is_patch.shape[0] == \
|
||||
len(tokenizer.encode(
|
||||
hf_processor._prompt_split_image(
|
||||
aspect_ratio, num_patches_per_chunk),
|
||||
add_special_tokens=False))
|
||||
71
tests/models/multimodal/processing/test_mllama.py
Normal file
71
tests/models/multimodal/processing/test_mllama.py
Normal file
@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Tests for mllama's multimodal preprocessing and profiling."""
|
||||
import pytest
|
||||
from transformers import MllamaConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["meta-llama/Llama-3.2-11B-Vision-Instruct"])
|
||||
@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
|
||||
@pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
|
||||
def test_profiling(
|
||||
model_id: str,
|
||||
max_model_len: int,
|
||||
max_num_seqs: int,
|
||||
):
|
||||
# regression test for https://github.com/vllm-project/vllm/issues/13929
|
||||
from vllm.model_executor.models.mllama import calc_token_per_chunk
|
||||
|
||||
model_config_kwargs = {
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
ctx = build_model_context(
|
||||
model_id,
|
||||
model_config_kwargs=model_config_kwargs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
mm_config = ctx.get_mm_config()
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
|
||||
dummy_encoder_data = profiler.get_encoder_dummy_data(
|
||||
max_model_len,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
)
|
||||
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
|
||||
max_model_len,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
)
|
||||
|
||||
hf_config = ctx.get_hf_config(MllamaConfig)
|
||||
image_size = hf_config.vision_config.image_size
|
||||
encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
|
||||
] * max_num_seqs
|
||||
|
||||
mm_kwargs = processor.apply(
|
||||
prompt=dummy_mm_data.prompt_text,
|
||||
mm_data=dummy_mm_data.mm_data,
|
||||
hf_processor_mm_kwargs=dict(),
|
||||
)["mm_kwargs"]
|
||||
|
||||
# Get the actual number of encoder tokens for each sample.
|
||||
# Because attn_metadata.encoder_seq_lens only counts the last
|
||||
# group of images for each sample, which is used to cheat the
|
||||
# block manager to allocate blocks for those images only.
|
||||
# See MllamaMultiModalProcessor for more details.
|
||||
num_tiles = [[t] for t in mm_kwargs.pop("num_tiles")]
|
||||
num_tokens_per_tile = calc_token_per_chunk(image_size)
|
||||
actual_encoder_seq_lens = [
|
||||
sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
|
||||
]
|
||||
|
||||
# simulate mllama image-present prefill.
|
||||
for actual_len, last_group_len in zip(actual_encoder_seq_lens,
|
||||
encoder_seq_lens):
|
||||
assert actual_len >= last_group_len
|
||||
@ -124,6 +124,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"),
|
||||
"ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
|
||||
trust_remote_code=True),
|
||||
"ChatGLMForConditionalGeneration": _HfExamplesInfo("thu-coai/ShieldLM-6B-chatglm3", # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01",
|
||||
trust_remote_code=True),
|
||||
"Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501
|
||||
@ -176,6 +178,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
"MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B",
|
||||
trust_remote_code=True),
|
||||
"MiniMaxText01ForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01",
|
||||
trust_remote_code=True),
|
||||
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
|
||||
"MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1"), # noqa: E501
|
||||
"QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501
|
||||
@ -200,6 +204,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct",
|
||||
extras={"2.5": "Qwen/Qwen2.5-7B-Instruct"}), # noqa: E501
|
||||
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
|
||||
"Qwen3ForCausalLM": _HfExamplesInfo(
|
||||
"Qwen/Qwen3-8B",
|
||||
is_available_online=False,
|
||||
min_transformers_version="4.51"
|
||||
),
|
||||
"Qwen3MoeForCausalLM": _HfExamplesInfo(
|
||||
"Qwen/Qwen3-MoE-15B-A2B",
|
||||
is_available_online=False,
|
||||
min_transformers_version="4.51"
|
||||
),
|
||||
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
|
||||
is_available_online=False),
|
||||
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501
|
||||
@ -275,12 +289,16 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True,
|
||||
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
|
||||
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
|
||||
extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
|
||||
extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501
|
||||
max_transformers_version="4.48", # noqa: E501
|
||||
transformers_version_reason="HF model is not compatible."), # noqa: E501
|
||||
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
|
||||
extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
|
||||
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
|
||||
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
|
||||
min_transformers_version="4.51"),
|
||||
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
|
||||
extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
|
||||
"mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501
|
||||
@ -327,7 +345,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
min_transformers_version="4.49"), # noqa: E501
|
||||
"SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
|
||||
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
|
||||
trust_remote_code=True),
|
||||
trust_remote_code=True,
|
||||
max_transformers_version="4.50"),
|
||||
# [Encoder-decoder]
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
|
||||
@ -7,6 +7,8 @@ from transformers import PretrainedConfig
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
|
||||
from vllm.utils import GiB_bytes
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
|
||||
from vllm.v1.engine.core import EngineCore as V1EngineCore
|
||||
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
@ -42,14 +44,21 @@ def test_can_initialize(model_arch):
|
||||
self.cache_config.num_gpu_blocks = 0
|
||||
self.cache_config.num_cpu_blocks = 0
|
||||
|
||||
def _initalize_kv_caches_v1(self, vllm_config):
|
||||
# gpu_blocks (> 0), cpu_blocks
|
||||
return 1, 0
|
||||
def _initialize_kv_caches_v1(self, vllm_config):
|
||||
kv_cache_specs = self.model_executor.get_kv_cache_specs()
|
||||
scheduler_kv_cache_config = get_kv_cache_config(
|
||||
vllm_config,
|
||||
kv_cache_specs[0],
|
||||
20 * GiB_bytes,
|
||||
)
|
||||
|
||||
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
|
||||
return 1, 0, scheduler_kv_cache_config
|
||||
|
||||
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
|
||||
_initialize_kv_caches_v0),
|
||||
patch.object(V1EngineCore, "_initialize_kv_caches",
|
||||
_initalize_kv_caches_v1)):
|
||||
_initialize_kv_caches_v1)):
|
||||
LLM(
|
||||
model_info.default,
|
||||
tokenizer=model_info.tokenizer,
|
||||
|
||||
@ -255,6 +255,7 @@ def build_model_context(
|
||||
model_id: str,
|
||||
task: TaskOption = "auto",
|
||||
dtype: Union[str, torch.dtype] = "auto",
|
||||
model_config_kwargs: Optional[dict[str, Any]] = None,
|
||||
mm_processor_kwargs: Optional[dict[str, Any]] = None,
|
||||
limit_mm_per_prompt: Optional[dict[str, int]] = None,
|
||||
disable_mm_preprocessor_cache: bool = True,
|
||||
@ -274,6 +275,7 @@ def build_model_context(
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
model_config_kwargs = model_config_kwargs or {}
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task=task,
|
||||
@ -286,5 +288,6 @@ def build_model_context(
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
**model_config_kwargs,
|
||||
)
|
||||
return InputContext(model_config)
|
||||
|
||||
@ -64,9 +64,11 @@ def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
|
||||
key_cache = torch.zeros_like(key_cache_cpu, device=device)
|
||||
value_cache = torch.zeros_like(value_cache_cpu, device=device)
|
||||
slot_mapping = slot_mapping_cpu.to(device)
|
||||
kv_cache = torch.stack([key_cache, value_cache])
|
||||
|
||||
# Run vectorized implementation on XLA device
|
||||
reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)
|
||||
reshape_and_cache(key, value, kv_cache, slot_mapping)
|
||||
key_cache, value_cache = torch.unbind(kv_cache, dim=0)
|
||||
|
||||
# Move results back to CPU for comparison
|
||||
key_cache_result = key_cache.cpu()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user