Compare commits

2 Commits: pil_image...dynamo-pat

| Author | SHA1 | Date |
|---|---|---|
|  | 6de0982dd0 |  |
|  | 45fa7f9b8e |  |
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:

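All of the lm-eval-harness config files touched in these hunks share one schema: a model_name, a list of tasks with per-metric ground-truth values, plus limit and num_fewshot (the deleted configs later in this diff show the full layout). As a rough illustration of how such a file is read, here is a minimal Python sketch; the metric values in it are illustrative and not taken from this diff.

```python
# Minimal sketch: read one lm-eval-harness baseline config and print its
# expected metric values. The numeric values below are illustrative only.
import yaml

config_text = """
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.67        # illustrative value, not from this diff
  - name: "exact_match,flexible-extract"
    value: 0.67        # illustrative value, not from this diff
limit: 1000
num_fewshot: 5
"""

eval_config = yaml.safe_load(config_text)
print(eval_config["model_name"])
for task in eval_config["tasks"]:
    for metric in task["metrics"]:
        print(f'{task["name"]} | {metric["name"]}: expected {metric["value"]}')
```
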
@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:

@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.335
-  - name: "exact_match,flexible-extract"
-    value: 0.323
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:

@@ -1,12 +1,11 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.231
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.22
+    value: 0.236
 limit: 1000
 num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:

@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"

@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
-model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.30
-  - name: "exact_match,flexible-extract"
-    value: 0.465
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
-model_name: "Qwen/Qwen2.5-1.5B-Instruct"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.54
-  - name: "exact_match,flexible-extract"
-    value: 0.59
-limit: 1319
-num_fewshot: 5

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.47
-  - name: "exact_match,flexible-extract"
-    value: 0.64
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:

@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml

@@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml

@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-from pathlib import Path
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--config-list-file",
-        action="store",
-        help="Path to the file listing model config YAMLs (one per line)")
-    parser.addoption("--tp-size",
-                     action="store",
-                     default="1",
-                     help="Tensor parallel size to use for evaluation")
-
-
-@pytest.fixture(scope="session")
-def config_list_file(pytestconfig, config_dir):
-    rel_path = pytestconfig.getoption("--config-list-file")
-    return config_dir / rel_path
-
-
-@pytest.fixture(scope="session")
-def tp_size(pytestconfig):
-    return pytestconfig.getoption("--tp-size")
-
-
-def pytest_generate_tests(metafunc):
-    if "config_filename" in metafunc.fixturenames:
-        rel_path = metafunc.config.getoption("--config-list-file")
-        config_list_file = Path(rel_path).resolve()
-        config_dir = config_list_file.parent
-        with open(config_list_file, encoding="utf-8") as f:
-            configs = [
-                config_dir / line.strip() for line in f
-                if line.strip() and not line.startswith("#")
-            ]
-        metafunc.parametrize("config_filename", configs)

.buildkite/lm-eval-harness/run-tests.sh (new file, 59 lines)
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c )
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi

@@ -3,25 +3,34 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
 
-pytest -s -v test_lm_eval_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4
+* pytest -s test_lm_eval_correctness.py
 """
 
+import os
+from pathlib import Path
+
 import lm_eval
-import numpy as np
+import numpy
 import yaml
 
-RTOL = 0.08
+RTOL = 0.05
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
 
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
 
-def launch_lm_eval(eval_config, tp_size):
+
+def launch_lm_eval(eval_config):
     trust_remote_code = eval_config.get('trust_remote_code', False)
 
     model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={tp_size}," \
-                 f"enforce_eager=true," \
+                 f"tensor_parallel_size={TP_SIZE}," \
                  f"add_bos_token=true," \
                  f"trust_remote_code={trust_remote_code}"
 
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,

@@ -29,14 +38,18 @@ def launch_lm_eval(eval_config, tp_size):
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
         batch_size="auto")
 
     return results
 
 
-def test_lm_eval_correctness_param(config_filename, tp_size):
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
 
-    results = launch_lm_eval(eval_config, tp_size)
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
 
+    # Confirm scores match ground truth.
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:

@@ -44,7 +57,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and np.isclose(
+            success = success and numpy.isclose(
                 ground_truth, measured_value, rtol=RTOL)
 
+    # Assert at the end, print all scores even on failure for debugging.
     assert success

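Both versions of the test above accept a model when every measured lm-eval metric falls within the relative tolerance RTOL of the ground-truth value stored in its YAML config. A minimal standalone sketch of that check, using illustrative numbers rather than values from this diff:

```python
# Sketch of the tolerance check performed by test_lm_eval_correctness.
# The ground-truth/measured numbers below are illustrative only.
import numpy as np

RTOL = 0.05  # the head side of this diff uses 0.05; the base side uses 0.08

ground_truth = 0.74   # value stored in the model's YAML config (illustrative)
measured = 0.72       # value reported by lm_eval.simple_evaluate (illustrative)

# numpy.isclose(a, b, rtol) passes when |a - b| <= atol + rtol * |b|,
# i.e. the tolerance here is relative to the measured value.
ok = np.isclose(ground_truth, measured, rtol=RTOL)
print(f"ground_truth={ground_truth} | measured={measured} | within tolerance: {ok}")
```
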
@@ -1,13 +1,15 @@
 # vLLM benchmark suite
 
+
 ## Introduction
 
 This directory contains two sets of benchmark for vllm.
 
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
 
-See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 
+
 ## Performance benchmark quick overview
 
@@ -17,14 +19,17 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
 
 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
 
+
 ## Nightly benchmark quick overview
 
 **Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
 
 **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
 
 **Benchmarking Duration**: about 3.5hrs.
 
+
+
 ## Trigger the benchmark
 
 Performance benchmark will be triggered when:
@@ -34,11 +39,16 @@ Performance benchmark will be triggered when:
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
 
+
+
+
 ## Performance benchmark details
 
+
+
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
-### Latency test
+#### Latency test
 
 Here is an example of one test inside `latency-tests.json`:
 
@@ -58,25 +68,23 @@ Here is an example of one test inside `latency-tests.json`:
 ```
 
 In this example:
-
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
 - The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
 
 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
 
-### Throughput test
+#### Throughput test
 
 The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
 
 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
 
-### Serving test
+#### Serving test
 
 We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
 
-```json
+```
 [
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
@@ -101,7 +109,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 ```
 
 Inside this example:
-
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
 - The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
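As the README text above explains, the `parameters`, `server-parameters`, and `client-parameters` blocks use underscores, and the runner script converts each key into the matching dashed command-line flag. A rough Python sketch of that conversion (the real conversion happens in `run-performance-benchmarks.sh`, and the parameter dict below is illustrative):

```python
# Sketch of how an underscore-keyed parameters block maps onto CLI flags.
# The dict is illustrative; the runner script does the equivalent in bash.
def params_to_cli_args(parameters: dict) -> str:
    parts = []
    for key, value in parameters.items():
        flag = "--" + key.replace("_", "-")
        if value == "":
            parts.append(flag)              # boolean-style flag with no value
        else:
            parts.append(f"{flag} {value}")
    return " ".join(parts)

example = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print(params_to_cli_args(example))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```
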
@@ -111,33 +118,36 @@ The number of this test is less stable compared to the delay and latency benchma
 
 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
 
-### Visualizing the results
+#### Visualizing the results
 
 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
 
+
+
 ## Nightly test details
 
 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
 
+
-### Workflow
+#### Workflow
 
 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
 - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
 - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
 - At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
 
-### Nightly tests
+#### Nightly tests
 
 In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
 
-### Docker containers
+#### Docker containers
 
 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
 
 WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
 
 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

@@ -10,18 +10,12 @@ steps:
       - image: badouralix/curl-jq
         command:
         - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
 
   - label: "A100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: A100
     depends_on: wait-for-container-image
-    if: build.branch == "main"
     plugins:
     - kubernetes:
         podSpec:
@@ -56,7 +50,6 @@ steps:
     agents:
       queue: H200
     depends_on: wait-for-container-image
-    if: build.branch == "main"
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

@@ -82,7 +75,6 @@ steps:
     agents:
       queue: H100
     depends_on: wait-for-container-image
-    if: build.branch == "main"
    plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

@@ -98,87 +90,3 @@ steps:
         environment:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: 4,5,6,7
-        volumes:
-        - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-  #  key: block-h100
-  #  depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-    - docker#v5.12.0:
-        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-        command:
-        - bash
-        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-        mount-buildkite-agent: true
-        propagate-environment: true
-        ipc: host
-        gpus: all  # see CUDA_VISIBLE_DEVICES for actual GPUs used
-        volumes:
-        - /data/benchmark-hf-cache:/root/.cache/huggingface
-        environment:
-        - VLLM_USAGE_SOURCE
-        - HF_TOKEN

@@ -9,19 +9,20 @@ This file contains the downloading link for benchmarking results.
 
 Please download the visualization scripts in the post
 
+
+
 ## Results reproduction
 
 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
   - Download `nightly-benchmarks.zip`.
-  - In the same folder, run the following code:
-
-```console
+  - In the same folder, run the following code
+  ```
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git
 unzip nightly-benchmarks.zip
 VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
 ```
 
 And the results will be inside `./benchmarks/results`.

@@ -2,7 +2,6 @@
 # Nightly benchmark
 
 This benchmark aims to:
-
 - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
 - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
 

@@ -10,6 +9,7 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)
 
 Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
+
 ## Setup
 
 - Docker images:

@@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
 - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
 - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
 
-## Known issues
+# Known issues
 
 - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
 - TGI does not support `ignore-eos` flag.
@@ -7,8 +7,10 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
+
 {latency_tests_markdown_table}
 
+
 ## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).

@@ -17,8 +19,10 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
+
 {throughput_tests_markdown_table}
 
+
 ## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@@ -29,11 +33,13 @@
 - We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
+
 {serving_tests_markdown_table}
 
+
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format.
 You can load the benchmarking tables into pandas dataframes as follows:
 
 ```python
@@ -48,9 +54,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
 ```
 
 The json string for all benchmarking tables:
 
 ```json
 {benchmarking_results_in_json_string}
 ```
 
 You can also check the raw experiment data in the Artifact tab of the Buildkite page.
 
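The hunk above keeps the pandas-loading instructions intact; only surrounding blank lines change. For reference, a self-contained sketch of that loading step, where the inline JSON stands in for the {benchmarking_results_in_json_string} placeholder; the "serving" key is taken from the snippet shown above, while the other table names and the column names are illustrative assumptions.

```python
# Sketch: load the JSON version of the benchmarking tables into pandas
# DataFrames. The inline JSON below is illustrative only.
import json

import pandas as pd

benchmarking_results_in_json_string = """
{
  "latency": {"Test name": {"0": "latency_llama8B_tp1"}, "Mean latency (ms)": {"0": 1234.5}},
  "throughput": {"Test name": {"0": "throughput_llama8B_tp1"}, "Tput (req/s)": {"0": 12.3}},
  "serving": {"Test name": {"0": "serving_llama8B_tp1_sharegpt_qps_1"}, "Mean TTFT (ms)": {"0": 56.7}}
}
"""

benchmarking_results = json.loads(benchmarking_results_in_json_string)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
print(serving_results)
```
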
@@ -84,13 +84,8 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_serving.py`
 
             # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
             raw_result.update(command)
 
             # update the test name of this result

@@ -104,13 +99,8 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_latency.py`
 
             # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
             raw_result.update(command)
 
             # update the test name of this result

@@ -131,13 +121,8 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_throughput.py`
 
             # attach the benchmarking command to raw_result
-            try:
-                with open(test_file.with_suffix(".commands")) as f:
-                    command = json.loads(f.read())
-            except OSError as e:
-                print(e)
-                continue
-
+            with open(test_file.with_suffix(".commands")) as f:
+                command = json.loads(f.read())
             raw_result.update(command)
 
             # update the test name of this result

@@ -426,7 +426,7 @@ main() {
 
   pip install -U transformers
 
-  pip install -r requirements/dev.txt
+  pip install -r requirements-dev.txt
   which genai-perf
 
   # check storage

@@ -10,24 +10,15 @@ set -x
 set -o pipefail
 
 check_gpus() {
-  if command -v nvidia-smi; then
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-  fi
-
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   if [[ $gpu_count -gt 0 ]]; then
     echo "GPU found."
   else
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
-  if command -v nvidia-smi; then
-    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
-  elif command -v amd-smi; then
-    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-  fi
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   echo "GPU type is $gpu_type"
 }
 
@@ -99,15 +90,9 @@ kill_gpu_processes() {
 
 
   # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v amd-smi; then
-    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
-      sleep 1
-    done
-  fi
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
 
   # remove vllm config file
   rm -rf ~/.config/vllm
@@ -324,14 +309,11 @@ run_serving_tests() {
 
       new_test_name=$test_name"_qps_"$qps
 
-      # pass the tensor parallel size to the client so that it can be displayed
-      # on the benchmark dashboard
       client_command="python3 benchmark_serving.py \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
         --request-rate $qps \
-        --metadata "tensor_parallel_size=$tp" \
         $client_args"
 
       echo "Running test case $test_name with qps $qps"
@@ -363,11 +345,6 @@ main() {
   check_gpus
   check_hf_token
 
-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)

@@ -376,7 +353,7 @@ main() {
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOGGING_LEVEL="WARNING"
+  export VLLM_LOG_LEVEL="WARNING"
 
   # prepare for benchmarking
   cd benchmarks || exit 1

@@ -1,10 +1,6 @@
 #!/bin/sh
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
-  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
-else
-  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-fi
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
 
 TIMEOUT_SECONDS=10
 

@@ -29,4 +29,4 @@
       "num-iters": 15
     }
   }
-]
+]

@@ -63,12 +63,11 @@
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "disable_log_requests": "",
             "tensor_parallel_size": 4,
             "swap_space": 16,
-            "speculative_config": {
-                "model": "turboderp/Qwama-0.5B-Instruct",
-                "num_speculative_tokens": 4,
-                "draft_tensor_parallel_size": 1
-            }
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1,
+            "use_v2_block_manager": ""
         },
         "client_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

@@ -32,4 +32,4 @@
       "backend": "vllm"
     }
   }
-]
+]

@@ -1,23 +1,12 @@
 steps:
-  - label: "Build wheel - CUDA 12.8"
+  - label: "Build wheel - CUDA 12.1"
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  - label: "Build wheel - CUDA 12.6"
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 

@ -31,10 +20,10 @@ steps:
      agents:
        queue: cpu_queue_postmerge
      commands:
-       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
        - "mkdir artifacts"
        - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-       - "bash .buildkite/scripts/upload-wheels.sh"
+       - "bash .buildkite/upload-wheels.sh"
      env:
        DOCKER_BUILDKIT: "1"
@ -48,7 +37,7 @@ steps:
        queue: cpu_queue_postmerge
      commands:
        - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
        - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

    - label: "Build and publish TPU release image"
@ -57,9 +46,7 @@ steps:
      agents:
        queue: tpu_queue_postmerge
      commands:
-       - "yes | docker system prune -a"
-       - "git fetch --all"
-       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
+       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
        - "docker push vllm/vllm-tpu:nightly"
        - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
      plugins:
@ -84,22 +71,7 @@ steps:
        queue: cpu_queue_postmerge
      commands:
        - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
        - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
      env:
        DOCKER_BUILDKIT: "1"

-   - block: "Build Neuron release image"
-     key: block-neuron-release-image-build
-     depends_on: ~
-
-   - label: "Build and publish Neuron release image"
-     depends_on: block-neuron-release-image-build
-     agents:
-       queue: neuron-postmerge
-     commands:
-       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-       - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
-     env:
-       DOCKER_BUILDKIT: "1"
@ -75,91 +75,36 @@ HF_MOUNT="/root/.cache/huggingface"
  commands=$@
  echo "Commands:$commands"
  #ignore certain kernels tests
- if [[ $commands == *" kernels/core"* ]]; then
+ if [[ $commands == *" kernels "* ]]; then
  commands="${commands} \
- --ignore=kernels/core/test_fused_quant_layernorm.py \
- --ignore=kernels/core/test_permute_cols.py"
+ --ignore=kernels/test_attention.py \
+ --ignore=kernels/test_attention_selector.py \
+ --ignore=kernels/test_blocksparse_attention.py \
+ --ignore=kernels/test_causal_conv1d.py \
+ --ignore=kernels/test_cutlass.py \
+ --ignore=kernels/test_encoder_decoder_attn.py \
+ --ignore=kernels/test_flash_attn.py \
+ --ignore=kernels/test_flashinfer.py \
+ --ignore=kernels/test_int8_quant.py \
+ --ignore=kernels/test_machete_gemm.py \
+ --ignore=kernels/test_mamba_ssm.py \
+ --ignore=kernels/test_marlin_gemm.py \
+ --ignore=kernels/test_moe.py \
+ --ignore=kernels/test_prefix_prefill.py \
+ --ignore=kernels/test_rand.py \
+ --ignore=kernels/test_sampler.py"
  fi

- if [[ $commands == *" kernels/attention"* ]]; then
+ #ignore certain Entrypoints tests
- commands="${commands} \
- --ignore=kernels/attention/stest_attention_selector.py \
- --ignore=kernels/attention/test_blocksparse_attention.py \
- --ignore=kernels/attention/test_encoder_decoder_attn.py \
- --ignore=kernels/attention/test_attention_selector.py \
- --ignore=kernels/attention/test_flash_attn.py \
- --ignore=kernels/attention/test_flashinfer.py \
- --ignore=kernels/attention/test_prefix_prefill.py \
- --ignore=kernels/attention/test_cascade_flash_attn.py \
- --ignore=kernels/attention/test_mha_attn.py \
- --ignore=kernels/attention/test_lightning_attn.py \
- --ignore=kernels/attention/test_attention.py"
- fi
-
- if [[ $commands == *" kernels/quantization"* ]]; then
- commands="${commands} \
- --ignore=kernels/quantization/test_int8_quant.py \
- --ignore=kernels/quantization/test_aqlm.py \
- --ignore=kernels/quantization/test_machete_mm.py \
- --ignore=kernels/quantization/test_block_fp8.py \
- --ignore=kernels/quantization/test_block_int8.py \
- --ignore=kernels/quantization/test_marlin_gemm.py \
- --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
- --ignore=kernels/quantization/test_int8_kernel.py"
- fi
-
- if [[ $commands == *" kernels/mamba"* ]]; then
- commands="${commands} \
- --ignore=kernels/mamba/test_mamba_mixer2.py \
- --ignore=kernels/mamba/test_causal_conv1d.py \
- --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
- fi
-
- if [[ $commands == *" kernels/moe"* ]]; then
- commands="${commands} \
- --ignore=kernels/moe/test_moe.py \
- --ignore=kernels/moe/test_cutlass_moe.py \
- --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
- fi
-
- #ignore certain Entrypoints/openai tests
  if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+ --ignore=entrypoints/openai/test_accuracy.py \
  --ignore=entrypoints/openai/test_audio.py \
- --ignore=entrypoints/openai/test_shutdown.py \
- --ignore=entrypoints/openai/test_completion.py \
- --ignore=entrypoints/openai/test_sleep.py \
- --ignore=entrypoints/openai/test_models.py \
- --ignore=entrypoints/openai/test_lora_adapters.py \
- --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
- --ignore=entrypoints/openai/test_root_path.py \
- --ignore=entrypoints/openai/test_tokenization.py \
- --ignore=entrypoints/openai/test_prompt_validation.py "}
+ --ignore=entrypoints/openai/test_encoder_decoder.py \
+ --ignore=entrypoints/openai/test_embedding.py \
+ --ignore=entrypoints/openai/test_oot_registration.py "}
  fi

- #ignore certain Entrypoints/llm tests
- if [[ $commands == *" entrypoints/llm "* ]]; then
- commands=${commands//" entrypoints/llm "/" entrypoints/llm \
- --ignore=entrypoints/llm/test_chat.py \
- --ignore=entrypoints/llm/test_accuracy.py \
- --ignore=entrypoints/llm/test_init.py \
- --ignore=entrypoints/llm/test_generate_multiple_loras.py \
- --ignore=entrypoints/llm/test_prompt_validation.py "}
- fi
-
- #Obsolete currently
- ##ignore certain Entrypoints/llm tests
- #if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
- #  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
- #fi
-
- # --ignore=entrypoints/openai/test_encoder_decoder.py \
- # --ignore=entrypoints/openai/test_embedding.py \
- # --ignore=entrypoints/openai/test_oot_registration.py
- # --ignore=entrypoints/openai/test_accuracy.py \
- # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
-

  PARALLEL_JOB_COUNT=8
  # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
  if [[ $commands == *"--shard-id="* ]]; then
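Both sides of this hunk build the final pytest invocation by string substitution on $commands. A minimal sketch of that substitution pattern with an illustrative command string (not repo code):

    # Splice extra --ignore flags into a pytest command string by replacing the
    # " entrypoints/openai " token with the same token plus the ignore flags.
    commands="pytest -v -s entrypoints/openai "
    commands=${commands//" entrypoints/openai "/" entrypoints/openai --ignore=entrypoints/openai/test_audio.py "}
    echo "$commands"
    # -> pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_audio.py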
@ -169,16 +114,13 @@ if [[ $commands == *"--shard-id="* ]]; then
  # assign shard-id for each shard
  commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
  echo "Shard ${GPU} commands:$commands_gpu"
- echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-   --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+   --device /dev/kfd --device /dev/dri \
-   --network=host \
+   --network host \
    --shm-size=16gb \
    --rm \
    -e HIP_VISIBLE_DEVICES="${GPU}" \
    -e HF_TOKEN \
-   -e AWS_ACCESS_KEY_ID \
-   -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    --name "${container_name}_${GPU}" \
@ -199,16 +141,13 @@ if [[ $commands == *"--shard-id="* ]]; then
  fi
  done
  else
- echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-   --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+   --device /dev/kfd --device /dev/dri \
-   --network=host \
+   --network host \
    --shm-size=16gb \
    --rm \
    -e HIP_VISIBLE_DEVICES=0 \
    -e HF_TOKEN \
-   -e AWS_ACCESS_KEY_ID \
-   -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    --name "${container_name}" \
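The two branches above run either one container per GPU shard or a single container. A sketch of the shard fan-out pattern this implies, assuming shards are launched in the background and then awaited; the command string and job count below are illustrative, not repo code:

    # Illustrative sketch of the shard fan-out pattern: substitute a concrete shard id
    # per GPU, launch each shard in the background, then wait and propagate failures.
    PARALLEL_JOB_COUNT=8
    commands="echo shard run: --shard-id= --num-shards=${PARALLEL_JOB_COUNT}"
    pids=()
    for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
        commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
        echo "Shard ${GPU} commands:$commands_gpu"
        bash -c "$commands_gpu" &
        pids+=($!)
    done
    for pid in "${pids[@]}"; do
        wait "$pid" || exit 1
    done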
@ -5,8 +5,8 @@
  set -ex
  set -o pipefail

- # cd 2 levels into the working directory
+ # cd into parent directory of this file
- cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+ cd "$(dirname "${BASH_SOURCE[0]}")/.."

  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
  remove_docker_container

  # Try building the docker image
- docker build -t cpu-test -f docker/Dockerfile.s390x .
+ docker build -t cpu-test -f Dockerfile.ppc64le .
@ -8,40 +8,34 @@ set -ex
  CORE_RANGE=${CORE_RANGE:-48-95}
  NUMA_NODE=${NUMA_NODE:-1}

+ # Try building the docker image
+ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
+
  # Setup cleanup
- remove_docker_container() {
-   set -e;
-   docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
-   docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
- }
+ remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
  trap remove_docker_container EXIT
  remove_docker_container

- # Try building the docker image
- numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
- numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
-
  # Run the image, setting --shm-size=4g for tensor parallel.
  docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-   --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+   --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
  docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-   --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+   --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

  function cpu_tests() {
  set -e
  export NUMA_NODE=$2
- export BUILDKITE_BUILD_NUMBER=$3

  # offline inference
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
  set -e
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ python3 examples/offline_inference/basic.py"

  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
  set -e
- pytest -v -s tests/kernels/test_cache.py -m cpu_model
- pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+ pip install -r vllm/requirements-test.txt
  pytest -v -s tests/models/decoder_only/language -m cpu_model
  pytest -v -s tests/models/embedding/language -m cpu_model
  pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@ -91,4 +85,4 @@ function cpu_tests() {

  # All of CPU tests are expected to be finished less than 40 mins.
  export -f cpu_tests
- timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+ timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
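The test entry point relies on export -f so that the shell function survives into the child bash started by timeout. A minimal sketch of that pattern, assuming a 40-minute budget as above; the function body is illustrative:

    # Sketch of the export -f + timeout pattern (assumptions, not repo code):
    # export a function so the child bash started by `timeout` can see and run it.
    my_tests() {
        echo "running tests on cores $1, NUMA node $2"
    }
    export -f my_tests
    # Kill the whole test run if it exceeds the budget; timeout exits with status 124 then.
    timeout 40m bash -c "my_tests 48-95 1"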
@ -9,13 +9,11 @@ python3 use_existing_torch.py

  # Try building the docker image
  DOCKER_BUILDKIT=1 docker build . \
-   --file docker/Dockerfile \
    --target vllm-openai \
    --platform "linux/arm64" \
    -t gh200-test \
    --build-arg max_jobs=66 \
    --build-arg nvcc_threads=2 \
-   --build-arg RUN_WHEEL_CHECK=false \
    --build-arg torch_cuda_arch_list="9.0+PTX" \
    --build-arg vllm_fa_cmake_gpu_arches="90-real"
@ -25,6 +23,6 @@ trap remove_docker_container EXIT
  remove_docker_container

  # Run the image and test offline inference
- docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+ docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
- python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+ python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
  '
@ -5,7 +5,7 @@
  set -ex

  # Try building the docker image
- docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+ docker build -t hpu-test-env -f Dockerfile.hpu .

  # Setup cleanup
  # certain versions of HPU software stack have a bug that can
@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
  remove_docker_container

  # Run the image and launch offline inference
- docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+ docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
  EXITCODE=$?
@ -3,7 +3,7 @@
  set -euox pipefail

  if [[ $# -lt 4 ]]; then
-   echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+   echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
  fi
@ -29,13 +29,16 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune -f
+ # Remove huggingface model artifacts and compiler cache
+ rm -rf "${HF_MOUNT:?}/*"
+ rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
  echo "$current_time" > /tmp/neuron-docker-build-timestamp
  fi
  else
  date "+%s" > /tmp/neuron-docker-build-timestamp
  fi

- docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+ docker build -t "${image_name}" -f Dockerfile.neuron .

  # Setup cleanup
  remove_docker_container() {
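The hunk above sits inside a timestamp-gated cleanup: Docker state is pruned only when the recorded build timestamp is old enough, then the timestamp is refreshed. A sketch of that gate, assuming a one-day threshold (the actual threshold is not visible in this diff):

    # Illustrative sketch of the timestamp-gated cleanup pattern (threshold is an assumption).
    if [ -f /tmp/neuron-docker-build-timestamp ]; then
        last_build_time=$(cat /tmp/neuron-docker-build-timestamp)
        current_time=$(date "+%s")
        if [ $((current_time - last_build_time)) -gt 86400 ]; then
            docker image prune -f
            docker volume prune -f && docker system prune -f
            echo "$current_time" > /tmp/neuron-docker-build-timestamp
        fi
    else
        date "+%s" > /tmp/neuron-docker-build-timestamp
    fi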
@ -44,11 +47,11 @@ remove_docker_container() {
  trap remove_docker_container EXIT

  # Run the image
- docker run --rm -it --device=/dev/neuron0 --network bridge \
+ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
    -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
    --name "${container_name}" \
    ${image_name} \
- /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+ /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
16
.buildkite/run-openvino-test.sh
Executable file
16
.buildkite/run-openvino-test.sh
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t openvino-test -f Dockerfile.openvino .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and launch offline inference
|
||||||
|
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
|
||||||
26
.buildkite/run-tpu-test.sh
Executable file
26
.buildkite/run-tpu-test.sh
Executable file
@ -0,0 +1,26 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# Build the docker image.
|
||||||
|
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||||
|
|
||||||
|
# Set up cleanup.
|
||||||
|
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# For HF_TOKEN.
|
||||||
|
source /etc/environment
|
||||||
|
# Run a simple end-to-end example.
|
||||||
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
|
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||||
|
&& python3 -m pip install pytest \
|
||||||
|
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||||
|
&& pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
|
||||||
|
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
|
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||||
|
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||||
|
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||||
19
.buildkite/run-xpu-test.sh
Normal file
19
.buildkite/run-xpu-test.sh
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Try building the docker image
|
||||||
|
docker build -t xpu-test -f Dockerfile.xpu .
|
||||||
|
|
||||||
|
# Setup cleanup
|
||||||
|
remove_docker_container() { docker rm -f xpu-test || true; }
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Run the image and test offline inference/tensor parallel
|
||||||
|
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
|
||||||
|
python3 examples/offline_inference/basic.py
|
||||||
|
python3 examples/offline_inference/cli.py -tp 2
|
||||||
|
'
|
||||||
@ -1,45 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
if [[ -n "$container_id" ]]; then
|
|
||||||
podman rm -f "$container_id" || true
|
|
||||||
fi
|
|
||||||
podman system prune -f
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
|
||||||
|
|
||||||
# Run the image
|
|
||||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
|
|
||||||
|
|
||||||
function cpu_tests() {
|
|
||||||
|
|
||||||
# offline inference
|
|
||||||
podman exec -it "$container_id" bash -c "
|
|
||||||
set -e
|
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
|
||||||
|
|
||||||
# Run basic model test
|
|
||||||
podman exec -it "$container_id" bash -c "
|
|
||||||
set -e
|
|
||||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
|
||||||
pip install sentence-transformers datamodel_code_generator
|
|
||||||
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
|
||||||
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
|
|
||||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
|
|
||||||
}
|
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
|
||||||
|
|
||||||
export container_id
|
|
||||||
export -f cpu_tests
|
|
||||||
timeout 40m bash -c cpu_tests
|
|
||||||
|
|
||||||
@ -1,103 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -xu
|
|
||||||
|
|
||||||
# Build the docker image.
|
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
|
||||||
|
|
||||||
# Set up cleanup.
|
|
||||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
|
||||||
remove_docker_container
|
|
||||||
|
|
||||||
# For HF_TOKEN.
|
|
||||||
source /etc/environment
|
|
||||||
# Run a simple end-to-end example.
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
|
||||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
|
||||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
|
||||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
|
||||||
&& export VLLM_XLA_CACHE_PATH= \
|
|
||||||
&& export VLLM_USE_V1=1 \
|
|
||||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
|
||||||
&& echo HARDWARE \
|
|
||||||
&& tpu-info \
|
|
||||||
&& { \
|
|
||||||
echo TEST_0: Running test_perf.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
|
|
||||||
echo TEST_0_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
&& { \
|
|
||||||
echo TEST_1: Running test_compilation.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
|
|
||||||
echo TEST_1_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_2: Running test_basic.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
|
|
||||||
echo TEST_2_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
|
||||||
echo TEST_3_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_4: Running test_quantization_accuracy.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
|
|
||||||
echo TEST_4_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_5: Running examples/offline_inference/tpu.py; \
|
|
||||||
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
|
|
||||||
echo TEST_5_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
{ \
|
|
||||||
echo TEST_6: Running test_tpu_model_runner.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
|
|
||||||
echo TEST_6_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
&& { \
|
|
||||||
echo TEST_7: Running test_sampler.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
|
|
||||||
echo TEST_7_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
&& { \
|
|
||||||
echo TEST_8: Running test_topk_topp_sampler.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
|
|
||||||
echo TEST_8_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
&& { \
|
|
||||||
echo TEST_9: Running test_multimodal.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
|
|
||||||
echo TEST_9_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
&& { \
|
|
||||||
echo TEST_10: Running test_pallas.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
|
|
||||||
echo TEST_10_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
&& { \
|
|
||||||
echo TEST_11: Running test_struct_output_generate.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
|
|
||||||
echo TEST_11_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
&& { \
|
|
||||||
echo TEST_12: Running test_moe_pallas.py; \
|
|
||||||
pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
|
|
||||||
echo TEST_12_EXIT_CODE: \$?; \
|
|
||||||
} & \
|
|
||||||
# Disable the TPU LoRA tests until the feature is activated
|
|
||||||
# && { \
|
|
||||||
# echo TEST_13: Running test_moe_pallas.py; \
|
|
||||||
# pytest -s -v /workspace/vllm/tests/tpu/lora/; \
|
|
||||||
# echo TEST_13_EXIT_CODE: \$?; \
|
|
||||||
# } & \
|
|
||||||
wait \
|
|
||||||
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
|
|
||||||
"
|
|
||||||
|
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
|
||||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
|
||||||
@ -1,31 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
|
||||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|
||||||
|
|
||||||
# Try building the docker image
|
|
||||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
|
||||||
|
|
||||||
# Setup cleanup
|
|
||||||
remove_docker_container() {
|
|
||||||
docker rm -f "${container_name}" || true;
|
|
||||||
docker image rm -f "${image_name}" || true;
|
|
||||||
docker system prune -f || true;
|
|
||||||
}
|
|
||||||
trap remove_docker_container EXIT
|
|
||||||
|
|
||||||
# Run the image and test offline inference/tensor parallel
|
|
||||||
docker run \
|
|
||||||
--device /dev/dri \
|
|
||||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
|
||||||
--entrypoint="" \
|
|
||||||
--name "${container_name}" \
|
|
||||||
"${image_name}" \
|
|
||||||
sh -c '
|
|
||||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
|
||||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
|
||||||
'
|
|
||||||
@ -2,13 +2,12 @@
|
|||||||
# adding a new command to an existing step. See different options here for examples.
|
# adding a new command to an existing step. See different options here for examples.
|
||||||
|
|
||||||
# This script will be feed into Jinja template in `test-template-aws.j2` at
|
# This script will be feed into Jinja template in `test-template-aws.j2` at
|
||||||
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
||||||
# to generate the final pipeline yaml file.
|
# to generate the final pipeline yaml file.
|
||||||
|
|
||||||
# Documentation
|
# Documentation
|
||||||
# label(str): the name of the test. emoji allowed.
|
# label(str): the name of the test. emoji allowed.
|
||||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
|
||||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||||
# command(str): the single command to run for tests. incompatible with commands.
|
# command(str): the single command to run for tests. incompatible with commands.
|
||||||
@ -16,7 +15,7 @@
|
|||||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||||
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||||
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||||
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||||
# in this case, commands must be specified. the first command runs on first host, the second
|
# in this case, commands must be specified. the first command runs on first host, the second
|
||||||
# command runs on the second host.
|
# command runs on the second host.
|
||||||
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||||
@ -25,8 +24,8 @@
|
|||||||
# When adding a test
|
# When adding a test
|
||||||
# - If the test belong to an existing group, add it there
|
# - If the test belong to an existing group, add it there
|
||||||
# - If the test is short, add to any existing step
|
# - If the test is short, add to any existing step
|
||||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||||
# Note that all steps execute in parallel.
|
# Note that all steps execute in parallel.
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
@ -36,12 +35,13 @@ steps:
|
|||||||
fast_check: true
|
fast_check: true
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
commands:
|
commands:
|
||||||
- pip install -r ../../requirements/docs.txt
|
- pip install -r requirements-docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
- SPHINXOPTS=\"-W\" make html
|
||||||
# Check API reference (if it fails, you may have missing mock imports)
|
# Check API reference (if it fails, you may have missing mock imports)
|
||||||
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
|
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/mq_llm_engine
|
- tests/mq_llm_engine
|
||||||
@ -71,7 +71,6 @@ steps:
|
|||||||
- label: Basic Correctness Test # 30min
|
- label: Basic Correctness Test # 30min
|
||||||
#mirror_hardwares: [amd]
|
#mirror_hardwares: [amd]
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/basic_correctness/test_basic_correctness
|
- tests/basic_correctness/test_basic_correctness
|
||||||
@ -79,7 +78,6 @@ steps:
|
|||||||
- tests/basic_correctness/test_preemption
|
- tests/basic_correctness/test_preemption
|
||||||
- tests/basic_correctness/test_cumem.py
|
- tests/basic_correctness/test_cumem.py
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s basic_correctness/test_cumem.py
|
- pytest -v -s basic_correctness/test_cumem.py
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
@ -106,73 +104,62 @@ steps:
|
|||||||
- label: Entrypoints Test # 40min
|
- label: Entrypoints Test # 40min
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
fast_check: true
|
fast_check: true
|
||||||
torch_nightly: true
|
mirror_hardwares: [amd]
|
||||||
#mirror_hardwares: [amd]
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/entrypoints/llm
|
|
||||||
- tests/entrypoints/openai
|
|
||||||
- tests/entrypoints/test_chat_utils
|
|
||||||
- tests/entrypoints/offline_mode
|
|
||||||
commands:
|
commands:
|
||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
|
||||||
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
|
||||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
|
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
|
||||||
- pytest -v -s entrypoints/test_chat_utils.py
|
- pytest -v -s entrypoints/test_chat_utils.py
|
||||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||||
|
|
||||||
- label: Distributed Tests (4 GPUs) # 10min
|
- label: Distributed Tests (4 GPUs) # 10min
|
||||||
working_dir: "/vllm-workspace/tests"
|
working_dir: "/vllm-workspace/tests"
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/distributed/
|
- vllm/distributed/
|
||||||
- vllm/core/
|
- vllm/core/
|
||||||
- tests/distributed/test_utils
|
- tests/distributed
|
||||||
- tests/distributed/test_pynccl
|
|
||||||
- tests/spec_decode/e2e/test_integration_dist_tp4
|
- tests/spec_decode/e2e/test_integration_dist_tp4
|
||||||
- tests/compile/test_basic_correctness
|
- tests/compile
|
||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/ray_placement.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
|
||||||
- tests/v1/test_async_llm_dp.py
|
|
||||||
commands:
|
commands:
|
||||||
# test with tp=2 and external_dp=2
|
|
||||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
|
||||||
# test with internal dp
|
|
||||||
- python3 ../examples/offline_inference/data_parallel.py
|
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
- pytest -v -s distributed/test_pynccl.py
|
- pytest -v -s distributed/test_pynccl.py
|
||||||
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
|
||||||
# TODO: create a dedicated test section for multi-GPU example tests
|
# TODO: create a dedicated test section for multi-GPU example tests
|
||||||
# when we have multiple distributed example tests
|
# when we have multiple distributed example tests
|
||||||
- pushd ../examples/offline_inference
|
- python3 ../examples/offline_inference/rlhf.py
|
||||||
- python3 rlhf.py
|
- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
|
||||||
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
|
||||||
- popd
|
|
||||||
|
|
||||||
- label: Metrics, Tracing Test # 10min
|
- label: Metrics, Tracing Test # 10min
|
||||||
mirror_hardwares: [amd]
|
num_gpus: 2
|
||||||
num_gpus: 2
|
fast_check: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/metrics
|
- tests/metrics
|
||||||
- tests/tracing
|
- tests/tracing
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s metrics
|
- pytest -v -s metrics
|
||||||
|
- "pip install \
|
||||||
|
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||||
|
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
||||||
- pytest -v -s tracing
|
- pytest -v -s tracing
|
||||||
|
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
##### 1 GPU test #####
|
##### 1 GPU test #####
|
||||||
|
|
||||||
- label: Regression Test # 5min
|
- label: Regression Test # 5min
|
||||||
#mirror_hardwares: [amd]
|
mirror_hardwares: [amd]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/test_regression
|
- tests/test_regression
|
||||||
@ -187,9 +174,6 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/engine
|
- tests/engine
|
||||||
- tests/tokenization
|
- tests/tokenization
|
||||||
- tests/test_sequence
|
|
||||||
- tests/test_config
|
|
||||||
- tests/test_logger
|
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||||
# OOM in the CI unless we run this separately
|
# OOM in the CI unless we run this separately
|
||||||
@ -202,23 +186,15 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s v1/core
|
- VLLM_USE_V1=1 pytest -v -s v1/core
|
||||||
- pytest -v -s v1/engine
|
- VLLM_USE_V1=1 pytest -v -s v1/engine
|
||||||
- pytest -v -s v1/entrypoints
|
- VLLM_USE_V1=1 pytest -v -s v1/sample
|
||||||
- pytest -v -s v1/sample
|
- VLLM_USE_V1=1 pytest -v -s v1/worker
|
||||||
- pytest -v -s v1/worker
|
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
|
||||||
- pytest -v -s v1/structured_output
|
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
|
||||||
- pytest -v -s v1/spec_decode
|
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
|
||||||
- pytest -v -s v1/test_stats.py
|
|
||||||
- pytest -v -s v1/test_utils.py
|
|
||||||
- pytest -v -s v1/test_oracle.py
|
|
||||||
# TODO: accuracy does not match, whether setting
|
# TODO: accuracy does not match, whether setting
|
||||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||||
- pytest -v -s v1/e2e
|
- VLLM_USE_V1=1 pytest -v -s v1/e2e
|
||||||
# Integration test for streaming correctness (requires special branch).
|
|
||||||
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
|
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
|
||||||
|
|
||||||
- label: Examples Test # 25min
|
- label: Examples Test # 25min
|
||||||
working_dir: "/vllm-workspace/examples"
|
working_dir: "/vllm-workspace/examples"
|
||||||
@ -228,22 +204,19 @@ steps:
|
|||||||
- examples/
|
- examples/
|
||||||
commands:
|
commands:
|
||||||
- pip install tensorizer # for tensorizer test
|
- pip install tensorizer # for tensorizer test
|
||||||
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
|
- python3 offline_inference/basic.py
|
||||||
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
|
- python3 offline_inference/cpu_offload.py
|
||||||
- python3 offline_inference/basic/chat.py
|
- python3 offline_inference/chat.py
|
||||||
- python3 offline_inference/prefix_caching.py
|
- python3 offline_inference/prefix_caching.py
|
||||||
- python3 offline_inference/llm_engine_example.py
|
- python3 offline_inference/llm_engine_example.py
|
||||||
- python3 offline_inference/audio_language.py --seed 0
|
- python3 offline_inference/vision_language.py
|
||||||
- python3 offline_inference/vision_language.py --seed 0
|
- python3 offline_inference/vision_language_multi_image.py
|
||||||
- python3 offline_inference/vision_language_embedding.py --seed 0
|
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
|
||||||
- - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/encoder_decoder.py
- - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ - python3 offline_inference/classification.py
- - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/embedding.py
- - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/scoring.py
- - python3 offline_inference/basic/score.py
+ - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

  - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]

@@ -270,7 +243,7 @@ steps:
  - vllm/model_executor/guided_decoding
  - tests/test_logits_processor
  - tests/model_executor/test_guided_processors
  commands:
  - pytest -v -s test_logits_processor.py
  - pytest -v -s model_executor/test_guided_processors.py

@@ -281,29 +254,19 @@ steps:
  - vllm/model_executor/models/eagle.py
  commands:
  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
  - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

  - label: LoRA Test %N # 15min each
- #mirror_hardwares: [amd]
+ mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
  parallelism: 4

- - label: PyTorch Compilation Unit Tests
+ - label: "PyTorch Fullgraph Smoke Test" # 9min
- torch_nightly: true
+ fast_check: true
- source_file_dependencies:
- - vllm/
- - tests/compile
- commands:
- - pytest -v -s compile/test_pass_manager.py
- - pytest -v -s compile/test_fusion.py
- - pytest -v -s compile/test_sequence_parallelism.py

- - label: PyTorch Fullgraph Smoke Test # 9min
- torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile

@@ -313,62 +276,25 @@ steps:
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py

- - label: PyTorch Fullgraph Test # 18min
+ - label: "PyTorch Fullgraph Test" # 18min
- torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- - label: Kernels Core Operation Test
+ - label: Kernels Test %N # 1h each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
- - tests/kernels/core
- commands:
- - pytest -v -s kernels/core

- - label: Kernels Attention Test %N
- mirror_hardwares: [amd]
- source_file_dependencies:
- - csrc/attention/
  - vllm/attention
- - vllm/v1/attention
+ - tests/kernels
- - tests/kernels/attention
  commands:
- - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- parallelism: 2
+ parallelism: 4

- - label: Kernels Quantization Test %N
- mirror_hardwares: [amd]
- source_file_dependencies:
- - csrc/quantization/
- - vllm/model_executor/layers/quantization
- - tests/kernels/quantization
- commands:
- - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- parallelism: 2

- - label: Kernels MoE Test
- #mirror_hardwares: [amd]
- source_file_dependencies:
- - csrc/moe/
- - tests/kernels/moe
- - vllm/model_executor/layers/fused_moe/
- commands:
- - pytest -v -s kernels/moe

- - label: Kernels Mamba Test
- #mirror_hardwares: [amd]
- source_file_dependencies:
- - csrc/mamba/
- - tests/kernels/mamba
- commands:
- - pytest -v -s kernels/mamba

  - label: Tensorizer Test # 11min
- # mirror_hardwares: [amd]
+ mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader

@@ -384,22 +310,14 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
- - bash scripts/run-benchmarks.sh
+ - bash run-benchmarks.sh

- - label: Benchmarks CLI Test # 10min
+ - label: Quantization Test # 33min
- source_file_dependencies:
- - vllm/
- - tests/benchmarks/
- commands:
- - pytest -v -s benchmarks/

- - label: Quantization Test
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
- commands:
+ command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

  - label: LM Eval Small Models # 53min
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"

@@ -408,15 +326,7 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+ - bash ./run-tests.sh -c configs/models-small.txt -t 1

- - label: OpenAI API correctness
- source_file_dependencies:
- - csrc/
- - vllm/entrypoints/openai/
- - vllm/model_executor/models/whisper.py
- commands: # LMEval+Transcription WER check
- - pytest -s entrypoints/openai/correctness/

  - label: Encoder Decoder tests # 5min
  source_file_dependencies:

@@ -427,101 +337,96 @@ steps:

  - label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
- #mirror_hardwares: [ amd ]
+ mirror_hardwares: [ amd ]
  source_file_dependencies:
  - vllm/
  - tests/tool_use
- - tests/mistral_tool_use
  commands:
  - pytest -v -s tool_use
- - pytest -v -s mistral_tool_use

  ##### models test #####

  - label: Basic Models Test # 24min
- torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
  - pytest -v -s models/test_transformers.py
  - pytest -v -s models/test_registry.py
- - pytest -v -s models/test_utils.py
+ - pytest -v -s models/test_initialization.py
- - pytest -v -s models/test_vision.py
- # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'

- - label: Language Models Test (Standard)
+ - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
- - tests/models/language
+ - tests/models/decoder_only/language
+ - tests/models/embedding/language
+ - tests/models/encoder_decoder/language
  commands:
- # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- - pytest -v -s models/language -m core_model
+ - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+ - pytest -v -s models/embedding/language -m core_model

- - label: Language Models Test (Extended)
+ - label: Language Models Test (Extended) # 1h10min
  optional: true
  source_file_dependencies:
  - vllm/
- - tests/models/language
+ - tests/models/decoder_only/language
+ - tests/models/embedding/language
+ - tests/models/encoder_decoder/language
  commands:
- # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- - pytest -v -s models/language -m 'not core_model'
+ - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+ - pytest -v -s models/embedding/language -m 'not core_model'

- - label: Multi-Modal Models Test (Standard)
+ - label: Multi-Modal Models Test (Standard) # 40min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
- - tests/models/multimodal
+ - tests/models/decoder_only/audio_language
+ - tests/models/decoder_only/vision_language
+ - tests/models/embedding/vision_language
+ - tests/models/encoder_decoder/audio_language
+ - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing
+ - pytest -v -s models/multimodal
- - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
+ - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+ - pytest -v -s models/embedding/vision_language -m core_model
+ - pytest -v -s models/encoder_decoder/audio_language -m core_model
+ - pytest -v -s models/encoder_decoder/language -m core_model
+ - pytest -v -s models/encoder_decoder/vision_language -m core_model

- - label: Multi-Modal Models Test (Extended) 1
+ - label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
  source_file_dependencies:
  - vllm/
- - tests/models/multimodal
+ - tests/models/decoder_only/audio_language
+ - tests/models/decoder_only/vision_language
+ - tests/models/embedding/vision_language
+ - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+ - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+ # HACK - run phi3v tests separately to sidestep this transformers bug
+ # https://github.com/huggingface/transformers/issues/34307
+ - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+ - pytest -v -s models/embedding/vision_language -m 'not core_model'
+ - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+ - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

- - label: Multi-Modal Models Test (Extended) 2
+ - label: Multi-Modal Models Test (Extended) 2 # 38m
  optional: true
  source_file_dependencies:
  - vllm/
- - tests/models/multimodal
+ - tests/models/decoder_only/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+ - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

- - label: Multi-Modal Models Test (Extended) 3
- optional: true
- source_file_dependencies:
- - vllm/
- - tests/models/multimodal
- commands:
- - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- - label: Quantized Models Test
- #mirror_hardwares: [amd]
- source_file_dependencies:
- - vllm/model_executor/layers/quantization
- - tests/models/quantization
- commands:
- - pytest -v -s models/quantization

  # This test is used only in PR development phase to test individual models and should never run on main
  - label: Custom Models Test
- mirror_hardwares: [amd]
  optional: true
  commands:
  - echo 'Testing custom models...'

@@ -533,7 +438,6 @@ steps:
  ##### multi gpus test #####

  - label: Distributed Comm Ops Test # 7min
- mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:

@@ -576,31 +480,28 @@ steps:
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
- - tests/v1/test_async_llm_dp.py
- - vllm/v1/engine/
  commands:
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- # test sequence parallel
+ - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

  - label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
+ fast_check: true
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/

@@ -611,7 +512,6 @@ steps:
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # other tests continue here:
- - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process

@@ -659,10 +559,13 @@ steps:
  # FIXIT: find out which code initialize cuda before running the test
  # before the fix, we need to use spawn to test it
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- # There is some Tensor Parallelism related processing logic in LoRA that
+ # This test runs llama 13B, so it is required to run on 4 GPUs.
+ - pytest -v -s -x lora/test_long_context.py
+ # There is some Tensor Parallelism related processing logic in LoRA that
  # requires multi-GPU testing for validation.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_minicpmv_tp.py

  - label: Weight Loading Multiple GPU Test # 33min

@@ -683,7 +586,7 @@ steps:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

  ##### multi gpus test #####

@@ -695,7 +598,7 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py

@@ -713,4 +616,4 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+ - bash ./run-tests.sh -c configs/models-large.txt -t 4
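Note: the LM Eval steps above swap the branch's `run-tests.sh` wrapper for a direct pytest invocation of `test_lm_eval_correctness.py`. A rough usage sketch of the two forms, assuming they run from the pipeline's `working_dir` (`/vllm-workspace/.buildkite/lm-eval-harness`) as shown in the steps:

```bash
cd /vllm-workspace/.buildkite/lm-eval-harness
export VLLM_WORKER_MULTIPROC_METHOD=spawn

# Wrapper-script form (right-hand side of the diff):
bash ./run-tests.sh -c configs/models-small.txt -t 1

# Direct pytest form (left-hand side of the diff):
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
```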
@@ -50,11 +50,8 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
  if [[ $normal_wheel == *"cu118"* ]]; then
  # if $normal_wheel matches cu118, do not upload the index.html
  echo "Skipping index files for cu118 wheels"
- elif [[ $normal_wheel == *"cu126"* ]]; then
- # if $normal_wheel matches cu126, do not upload the index.html
- echo "Skipping index files for cu126 wheels"
  else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu12 wheels (default wheels)
  aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
  aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
  fi

@@ -66,12 +63,9 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
  if [[ $normal_wheel == *"cu118"* ]]; then
  # if $normal_wheel matches cu118, do not upload the index.html
  echo "Skipping index files for cu118 wheels"
- elif [[ $normal_wheel == *"cu126"* ]]; then
- # if $normal_wheel matches cu126, do not upload the index.html
- echo "Skipping index files for cu126 wheels"
  else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu12 wheels (default wheels)
  aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
  fi

  aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
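For context, the left-hand side of the hunks above only publishes `index.html` for the default (cu128) wheels and skips both cu118 and cu126 builds. A minimal sketch of that branch's upload rule, assuming `$normal_wheel` and `$BUILDKITE_COMMIT` are set by the surrounding script as shown:

```bash
# Sketch of the index-upload rule from the diff above (assumes $normal_wheel
# and $BUILDKITE_COMMIT are provided by the enclosing upload script).
if [[ $normal_wheel == *"cu118"* ]]; then
    # cu118 wheels never get an index.html
    echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
    # cu126 wheels are skipped as well
    echo "Skipping index files for cu126 wheels"
else
    # only the default (cu128) wheels publish index.html
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
```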
28  .github/CODEOWNERS  vendored
@@ -10,33 +10,27 @@
  /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
  /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
  /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
- /vllm/model_executor/guided_decoding @mgoin @russellb
+ /vllm/model_executor/guided_decoding @mgoin
  /vllm/multimodal @DarkLight1337 @ywang96
- /vllm/vllm_flash_attn @LucasWilkinson
  CMakeLists.txt @tlrmchlsmth

  # vLLM V1
  /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
- /vllm/v1/structured_output @mgoin @russellb

  # Test ownership
- /.buildkite/lm-eval-harness @mgoin @simon-mo
  /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
- /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
+ /tests/test_inputs.py @DarkLight1337 @ywang96
+ /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
+ /tests/models @DarkLight1337 @ywang96
+ /tests/multimodal @DarkLight1337 @ywang96
+ /tests/prefix_caching @comaniac @KuntaiDu
+ /tests/spec_decode @njhill @LiuXiaoxuanPKU
+ /tests/kernels @tlrmchlsmth @WoosukKwon
+ /tests/quantization @mgoin @robertgshaw2-redhat
+ /.buildkite/lm-eval-harness @mgoin @simon-mo
  /tests/distributed/test_multi_node_assignment.py @youkaichao
  /tests/distributed/test_pipeline_parallel.py @youkaichao
  /tests/distributed/test_same_node.py @youkaichao
- /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
- /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
- /tests/kernels @tlrmchlsmth @WoosukKwon
- /tests/model_executor/test_guided_processors.py @mgoin @russellb
- /tests/models @DarkLight1337 @ywang96
  /tests/multi_step @alexm-redhat @comaniac
- /tests/multimodal @DarkLight1337 @ywang96
- /tests/prefix_caching @comaniac @KuntaiDu
- /tests/quantization @mgoin @robertgshaw2-redhat
- /tests/spec_decode @njhill @LiuXiaoxuanPKU
- /tests/test_inputs.py @DarkLight1337 @ywang96
- /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
- /tests/v1/structured_output @mgoin @russellb
  /tests/weight_loading @mgoin @youkaichao
+ /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
2  .github/ISSUE_TEMPLATE/200-installation.yml  vendored
@@ -14,7 +14,7 @@ body:
  description: |
  Please run the following and paste the output below.
  ```sh
- wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+ wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
  # For security purposes, please feel free to check the contents of collect_env.py before running it.
  python collect_env.py
  ```
2  .github/ISSUE_TEMPLATE/300-usage.yml  vendored
@@ -14,7 +14,7 @@ body:
  description: |
  Please run the following and paste the output below.
  ```sh
- wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+ wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
  # For security purposes, please feel free to check the contents of collect_env.py before running it.
  python collect_env.py
  ```
8  .github/ISSUE_TEMPLATE/400-bug-report.yml  vendored
@@ -14,19 +14,19 @@ body:
  description: |
  Please run the following and paste the output below.
  ```sh
- wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+ wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
  # For security purposes, please feel free to check the contents of collect_env.py before running it.
  python collect_env.py
  ```
  It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
  value: |
  <details>
- <summary>The output of <code>python collect_env.py</code></summary>
+ <summary>The output of `python collect_env.py`</summary>

  ```text
  Your output of `python collect_env.py` here
  ```

  </details>
  validations:
  required: true

@@ -75,7 +75,7 @@ body:
  ```

  ```
- The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+ The error message you got, with the full traceback.
  ```
  validations:
  required: true
2  .github/ISSUE_TEMPLATE/600-new-model.yml  vendored
@@ -9,7 +9,7 @@ body:
  value: >
  #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

- #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
+ #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
  - type: textarea
  attributes:
  label: The model to consider.

@@ -35,7 +35,7 @@ body:
  description: |
  Please run the following and paste the output below.
  ```sh
- wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+ wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
  # For security purposes, please feel free to check the contents of collect_env.py before running it.
  python collect_env.py
  ```
28  .github/ISSUE_TEMPLATE/800-misc-discussion.yml  vendored  Normal file
@@ -0,0 +1,28 @@
+ name: 🎲 Misc/random discussions that do not fit into the above categories.
+ description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
+ title: "[Misc]: "
+ labels: ["misc"]
+
+ body:
+ - type: markdown
+ attributes:
+ value: >
+ #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+ - type: textarea
+ attributes:
+ label: Anything you want to discuss about vllm.
+ description: >
+ Anything you want to discuss about vllm.
+ validations:
+ required: true
+ - type: markdown
+ attributes:
+ value: >
+ Thanks for contributing 🎉!
+ - type: checkboxes
+ id: askllm
+ attributes:
+ label: Before submitting a new issue...
+ options:
+ - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
+ required: true
4  .github/ISSUE_TEMPLATE/config.yml  vendored
@@ -1,5 +1 @@
  blank_issues_enabled: false
- contact_links:
- - name: Questions
- url: https://discuss.vllm.ai
- about: Ask questions and discuss with other vLLM community members
3  .github/PULL_REQUEST_TEMPLATE.md  vendored
@@ -2,5 +2,4 @@ FILL IN THE PR DESCRIPTION HERE

  FIX #xxxx (*link existing issues this PR will resolve*)

- <!--- pyml disable-next-line no-emphasis-as-heading -->
+ **BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
- **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
2  .github/dependabot.yml  vendored
@@ -23,7 +23,7 @@ updates:
  - dependency-name: "lm-format-enforcer"
  - dependency-name: "gguf"
  - dependency-name: "compressed-tensors"
- - dependency-name: "ray[cgraph]" # Ray Compiled Graph
+ - dependency-name: "ray[adag]"
  - dependency-name: "lm-eval"
  groups:
  minor-update:
82  .github/mergify.yml  vendored
@@ -5,7 +5,6 @@ pull_request_rules:
  - or:
  - files~=^[^/]+\.md$
  - files~=^docs/
- - files~=^examples/
  actions:
  label:
  add:

@@ -19,7 +18,7 @@ pull_request_rules:
  - files~=\.buildkite/
  - files~=^cmake/
  - files=CMakeLists.txt
- - files~=^docker/Dockerfile
+ - files~=^Dockerfile
  - files~=^requirements.*\.txt
  - files=setup.py
  actions:

@@ -36,38 +35,15 @@ pull_request_rules:
  add:
  - frontend

- - name: label-multi-modality
- description: Automatically apply multi-modality label
- conditions:
- - or:
- - files~=^vllm/multimodal/
- - files~=^tests/multimodal/
- - files~=^tests/models/multimodal/
- - files~=^tests/models/*/audio_language/
- - files~=^tests/models/*/vision_language/
- - files=tests/models/test_vision.py
- actions:
- label:
- add:
- - multi-modality

  - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
  - or:
- - files~=^benchmarks/structured_schemas/
- - files=benchmarks/benchmark_serving_structured_output.py
- - files=benchmarks/run_structured_output_benchmark.sh
- - files=docs/source/features/structured_outputs.md
- - files=examples/offline_inference/structured_outputs.py
- - files=examples/online_serving/openai_chat_completion_structured_outputs.py
- - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
  - files~=^vllm/model_executor/guided_decoding/
  - files=tests/model_executor/test_guided_processors.py
  - files=tests/entrypoints/llm/test_guided_generate.py
- - files~=^tests/v1/structured_output/
+ - files=benchmarks/benchmark_serving_guided.py
- - files=tests/v1/entrypoints/llm/test_guided_generate.py
+ - files=benchmarks/benchmark_guided.py
- - files~=^vllm/v1/structured_output/
  actions:
  label:
  add:

@@ -96,58 +72,6 @@ pull_request_rules:
  add:
  - v1

- - name: label-tpu
- description: Automatically apply tpu label
- # Keep this list in sync with `label-tpu-remove` conditions
- conditions:
- - or:
- - files~=tpu.py
- - files~=_tpu
- - files~=tpu_
- - files~=/tpu/
- - files~=pallas
- actions:
- label:
- add:
- - tpu

- - name: label-tpu-remove
- description: Automatically remove tpu label
- # Keep this list in sync with `label-tpu` conditions
- conditions:
- - and:
- - -files~=tpu.py
- - -files~=_tpu
- - -files~=tpu_
- - -files~=/tpu/
- - -files~=pallas
- actions:
- label:
- remove:
- - tpu

- - name: label-tool-calling
- description: Automatically add tool-calling label
- conditions:
- - or:
- - files~=^tests/tool_use/
- - files~=^tests/mistral_tool_use/
- - files~=^tests/entrypoints/openai/tool_parsers/
- - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- - files~=^vllm/entrypoints/openai/tool_parsers/
- - files=docs/source/features/tool_calling.md
- - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
- - files=docs/source/getting_started/examples/chat_with_tools.md
- - files~=^examples/tool_chat_*
- - files=examples/offline_inference/chat_with_tools.py
- - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
- - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
- - files=examples/online_serving/openai_chat_completion_client_with_tools.py
- actions:
- label:
- add:
- - tool-calling

  - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
  - conflict
2  .github/workflows/cleanup_pr_body.yml  vendored
@@ -16,7 +16,7 @@ jobs:
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

  - name: Set up Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
  with:
  python-version: '3.12'
14  .github/workflows/lint-and-deploy.yaml  vendored
@@ -12,17 +12,17 @@ jobs:
  fetch-depth: 0

  - name: Set up Helm
- uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
+ uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
  with:
  version: v3.14.4

  #Python is required because ct lint runs Yamale and yamllint which require Python.
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
  with:
  python-version: '3.13'

  - name: Set up chart-testing
- uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
+ uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
  with:
  version: v3.10.1

@@ -47,10 +47,10 @@ jobs:
  aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

  - name: Create kind cluster
- uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
+ uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0

  - name: Build the Docker image vllm cpu
- run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+ run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

  - name: Configuration of docker images, network and namespace for the kind cluster
  run: |

@@ -66,7 +66,7 @@ jobs:
  export AWS_SECRET_ACCESS_KEY=minioadmin
  sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
  helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

  - name: curl test
  run: |
  kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &

@@ -79,4 +79,4 @@ jobs:
  "max_tokens": 7,
  "temperature": 0
  }'):$CODE"
  echo "$CODE"
3  .github/workflows/pre-commit.yml  vendored
@@ -10,11 +10,10 @@ jobs:
  runs-on: ubuntu-latest
  steps:
  - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
  with:
  python-version: "3.12"
  - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
  - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
  with:
  extra_args: --all-files --hook-stage manual
4  .github/workflows/publish.yml  vendored
@@ -39,7 +39,7 @@ jobs:
  const script = require('.github/workflows/scripts/create_release.js')
  await script(github, context, core)

- # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
+ # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
  # wheel:
  #   name: Build Wheel
  #   runs-on: ${{ matrix.os }}

@@ -50,7 +50,7 @@ jobs:
  #     matrix:
  #       os: ['ubuntu-20.04']
  #       python-version: ['3.9', '3.10', '3.11', '3.12']
- #       pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
+ #       pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
  #       cuda-version: ['11.8', '12.1']

  #     steps:
2  .github/workflows/scripts/build.sh  vendored
@@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
  LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

  # Install requirements
- $python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
+ $python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt

  # Limit the number of parallel jobs to avoid OOM
  export MAX_JOBS=1
2  .github/workflows/scripts/create_release.js  vendored
@@ -1,4 +1,4 @@
- // Uses GitHub's API to create the release and wait for result.
+ // Uses Github's API to create the release and wait for result.
  // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.

  module.exports = async (github, context, core) => {
2  .github/workflows/stale.yml  vendored
@@ -13,7 +13,7 @@ jobs:
  actions: write
  runs-on: ubuntu-latest
  steps:
- - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+ - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
  with:
  # Increasing this value ensures that changes to this workflow
  # propagate to all issues and PRs in days rather than months
8  .gitignore  vendored
@@ -2,7 +2,7 @@
  /vllm/_version.py

  # vllm-flash-attn built from source
- vllm/vllm_flash_attn/*
+ vllm/vllm_flash_attn/

  # Byte-compiled / optimized / DLL files
  __pycache__/

@@ -80,7 +80,6 @@ instance/
  # Sphinx documentation
  docs/_build/
  docs/source/getting_started/examples/
- docs/source/api/vllm

  # PyBuilder
  .pybuilder/

@@ -198,11 +197,8 @@ _build/
  hip_compat.h

  # Benchmark dataset
- benchmarks/**/*.json
+ benchmarks/*.json

  # Linting
  actionlint
  shellcheck*/

- # Ingore moe/marlin_moe gen code
- csrc/moe/marlin_moe_wna16/kernel_*
@@ -1,53 +1,43 @@
- default_install_hook_types:
- - pre-commit
- - commit-msg
  default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
- exclude: 'vllm/third_party/.*'
  repos:
  - repo: https://github.com/google/yapf
  rev: v0.43.0
  hooks:
  - id: yapf
  args: [--in-place, --verbose]
+ additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.7
+ rev: v0.9.3
  hooks:
  - id: ruff
- args: [--output-format, github, --fix]
+ args: [--output-format, github]
  - repo: https://github.com/codespell-project/codespell
- rev: v2.4.1
+ rev: v2.4.0
  hooks:
  - id: codespell
- additional_dependencies: ['tomli']
+ exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
- args: ['--toml', 'pyproject.toml']
  - repo: https://github.com/PyCQA/isort
- rev: 6.0.1
+ rev: 5.13.2
  hooks:
  - id: isort
  - repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v20.1.3
+ rev: v19.1.7
  hooks:
  - id: clang-format
- exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
+ exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
  types_or: [c++, cuda]
  args: [--style=file, --verbose]
  - repo: https://github.com/jackdewinter/pymarkdown
- rev: v0.9.29
+ rev: v0.9.27
  hooks:
  - id: pymarkdown
- args: [fix]
+ files: docs/.*
  - repo: https://github.com/rhysd/actionlint
  rev: v1.7.7
  hooks:
  - id: actionlint
- - repo: https://github.com/astral-sh/uv-pre-commit
- rev: 0.6.17
- hooks:
- - id: pip-compile
- args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
- files: ^requirements/test\.(in|txt)$
  - repo: local
  hooks:
  - id: mypy-local

@@ -55,7 +45,7 @@ repos:
  entry: tools/mypy.sh 0 "local"
  language: python
  types: [python]
- additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+ additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
  stages: [pre-commit] # Don't run in CI
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
  name: Run mypy for Python 3.9

@@ -101,8 +91,8 @@ repos:
  args:
  - -c
  - |
- if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
+ if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
- printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
+ printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
  fi
  language: system
  verbose: true

@@ -112,24 +102,9 @@ repos:
  entry: python tools/check_spdx_header.py
  language: python
  types: [python]
- - id: check-filenames
- name: Check for spaces in all filenames
- entry: bash
- args:
- - -c
- - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
- language: system
- always_run: true
- pass_filenames: false
- - id: update-dockerfile-graph
- name: Update Dockerfile dependency graph
- entry: tools/update-dockerfile-graph.sh
- language: script
- # Keep `suggestion` last
  - id: suggestion
  name: Suggestion
  entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
  language: system
  verbose: true
  pass_filenames: false
- # Insert new entries above the `suggestion` entry
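For reference, the sign-off snippet on the left-hand side of the hunk above resolves the commit message path via `git rev-parse --git-path` instead of hard-coding `.git/COMMIT_EDITMSG`, which matters for worktrees where `.git` is a file rather than a directory. A minimal sketch of that hook body, assuming it runs from the repository root as the pre-commit entry shown above does:

```bash
#!/usr/bin/env bash
# Sketch of the DCO sign-off logic from the diff above.
# git rev-parse --git-path works both in a normal checkout and in worktrees.
msg_file="$(git rev-parse --git-path COMMIT_EDITMSG)"
signoff="Signed-off-by: $(git config user.name) <$(git config user.email)>"
if ! grep -q "^${signoff}" "$msg_file"; then
    # Append the trailer only when it is not already present.
    printf "\n%s\n" "$signoff" >> "$msg_file"
fi
```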
@@ -18,4 +18,4 @@ formats: []
  # Optionally declare the Python requirements required to build your docs
  python:
  install:
- - requirements: requirements/docs.txt
+ - requirements: docs/requirements-docs.txt
410
CMakeLists.txt
Normal file → Executable file
410
CMakeLists.txt
Normal file → Executable file
@@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
 
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 
@@ -30,10 +31,10 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent. The ROCm torch
-# versions are derived from docker/Dockerfile.rocm
+# versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
 
 #
 # Try to find python package with an executable that exactly matches
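Since the hunk above pins the torch version each side of this compare expects (2.7.0 versus 2.5.1), a quick environment check before configuring can catch a mismatch early; a small sketch:

    # Print the torch version visible to the build and compare it by hand
    # against TORCH_SUPPORTED_VERSION_CUDA / TORCH_SUPPORTED_VERSION_ROCM above.
    python3 -c 'import torch; print(torch.__version__)'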
@@ -173,25 +174,6 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
-#
-# Set rocm version dev int.
-#
-if(VLLM_GPU_LANG STREQUAL "HIP")
-  #
-  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
-  #
-  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
-
-  #
-  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
-  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
-  #
-  set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
-endif()
-
 #
 # Define other extension targets
 #
@@ -210,7 +192,7 @@ set_gencode_flags_for_srcs(
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "Enabling cumem allocator extension.")
   # link against cuda driver library
-  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  list(APPEND CUMEM_LIBS cuda)
   define_gpu_extension_target(
     cumem_allocator
     DESTINATION vllm
@@ -229,28 +211,24 @@ set(VLLM_EXT_SRC
   "csrc/cache_kernels.cu"
   "csrc/attention/paged_attention_v1.cu"
   "csrc/attention/paged_attention_v2.cu"
-  "csrc/attention/merge_attn_states.cu"
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
-  "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
-  "csrc/quantization/activation_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/prepare_inputs/advance_step.cu"
-  "csrc/custom_all_reduce.cu"
   "csrc/torch_bindings.cpp")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
-  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
+  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -267,8 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG ${CUTLASS_REVISION}
+        GIT_TAG v3.7.0
         GIT_PROGRESS TRUE
 
         # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -284,13 +261,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/mamba/causal_conv1d/causal_conv1d.cu"
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
+    "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
-    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu")
+    "csrc/sparse/cutlass/sparse_compressor_entry.cu"
+    "csrc/cutlass_extensions/common.cpp")
 
   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -299,54 +275,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
-
-    #
-    # For the Marlin kernels we automatically generate sources for various
-    # preselected input type pairs and schedules.
-    # Generate sources:
-    set(MARLIN_GEN_SCRIPT
-      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
-    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
-
-    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
-    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
-
-    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
-        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
-      execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=$PYTHONPATH
-        ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
-        RESULT_VARIABLE marlin_generation_result
-        OUTPUT_VARIABLE marlin_generation_result
-        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
-        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
-      )
-
-      if (NOT marlin_generation_result EQUAL 0)
-        message(FATAL_ERROR "Marlin generation failed."
-                " Result: \"${marlin_generation_result}\""
-                "\nCheck the log for details: "
-                "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
-      else()
-        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
-            CACHE STRING "Last run Marlin generate script hash" FORCE)
-        message(STATUS "Marlin generation completed successfully.")
-      endif()
-    else()
-      message(STATUS "Marlin generation script has not changed, skipping generation.")
-    endif()
-
-    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
-      CUDA_ARCHS "${MARLIN_ARCHS}")
-
-    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
-
     set(MARLIN_SRCS
+      "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@ -363,88 +295,43 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
" in CUDA target architectures")
|
" in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Only build AllSpark kernels if we are building for at least some compatible archs.
|
|
||||||
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
|
|
||||||
if (ALLSPARK_ARCHS)
|
|
||||||
set(ALLSPARK_SRCS
|
|
||||||
"csrc/quantization/gptq_allspark/allspark_repack.cu"
|
|
||||||
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${ALLSPARK_SRCS}"
|
|
||||||
CUDA_ARCHS "${ALLSPARK_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
|
|
||||||
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building AllSpark kernels as no compatible archs found"
|
|
||||||
" in CUDA target architectures")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
|
|
||||||
set(SCALED_MM_3X_ARCHS)
|
|
||||||
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
|
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
|
||||||
# CUDA 12.0 or later
|
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
|
||||||
# Let scaled_mm_c2x know it doesn't need to build these arches
|
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
|
||||||
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
|
|
||||||
else()
|
else()
|
||||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
|
||||||
message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
|
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
|
||||||
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
||||||
"later if you intend on running FP8 quantized models on "
|
"later if you intend on running FP8 quantized models on "
|
||||||
"Hopper.")
|
"Hopper.")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
|
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
|
||||||
"in CUDA target architectures")
|
"in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
|
||||||
|
|
||||||
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
|
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
|
||||||
# CUDA 12.8 or later
|
# build any 3x kernels
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
|
set(SCALED_MM_3X_ARCHS)
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
|
||||||
set(SRCS
|
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
|
||||||
)
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
|
|
||||||
# Let scaled_mm_c2x know it doesn't need to build these arches
|
|
||||||
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
|
|
||||||
else()
|
|
||||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
|
||||||
message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
|
|
||||||
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
|
|
||||||
"later if you intend on running FP8 quantized models on "
|
|
||||||
"Blackwell.")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
|
|
||||||
"in CUDA target architectures")
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
||||||
# kernels for the remaining archs that are not already built for 3x.
|
# kernels for the remaining archs that are not already built for 3x.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||||
"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||||
# subtract out the archs that are already built for 3x
|
# subtract out the archs that are already built for 3x
|
||||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||||
if (SCALED_MM_2X_ARCHS)
|
if (SCALED_MM_2X_ARCHS)
|
||||||
@@ -469,18 +356,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # 2:4 Sparse Kernels
 
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper).
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+    set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
+             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
-    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
       message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                      "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                      "if you intend on running FP8 sparse quantized models on Hopper.")
@ -490,69 +377,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# FP4 Archs and flags
|
|
||||||
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
|
||||||
set(SRCS
|
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${FP4_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
|
|
||||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
|
||||||
# clear FP4_ARCHS
|
|
||||||
set(FP4_ARCHS)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# CUTLASS MLA Archs and flags
|
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
|
|
||||||
set(SRCS
|
|
||||||
"csrc/attention/mla/cutlass_mla_kernels.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${MLA_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
|
|
||||||
# Add MLA-specific include directories only to MLA source files
|
|
||||||
set_source_files_properties(${SRCS}
|
|
||||||
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
|
|
||||||
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
|
|
||||||
# clear MLA_ARCHS
|
|
||||||
set(MLA_ARCHS)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# CUTLASS MoE kernels
|
|
||||||
|
|
||||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
|
||||||
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
|
|
||||||
# to compile MoE kernels that use its output.
|
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
|
||||||
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
|
|
||||||
message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
|
|
||||||
else()
|
|
||||||
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
|
||||||
message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
|
|
||||||
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
|
|
||||||
"if you intend on running FP8 quantized MoE models on Hopper.")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
|
|
||||||
"in CUDA target architectures")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Machete kernels
|
# Machete kernels
|
||||||
@@ -634,8 +458,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
-  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)
 
@ -654,70 +477,28 @@ set(VLLM_MOE_EXT_SRC
|
|||||||
"csrc/moe/moe_align_sum_kernels.cu"
|
"csrc/moe/moe_align_sum_kernels.cu"
|
||||||
"csrc/moe/topk_softmax_kernels.cu")
|
"csrc/moe/topk_softmax_kernels.cu")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_MOE_EXT_SRC}"
|
SRCS "${VLLM_MOE_EXT_SRC}"
|
||||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
CUDA_ARCHS "${CUDA_ARCHS}")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
set(VLLM_MOE_WNA16_SRC
|
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
|
||||||
"csrc/moe/moe_wna16.cu")
|
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${VLLM_MOE_WNA16_SRC}"
|
|
||||||
CUDA_ARCHS "${CUDA_ARCHS}")
|
|
||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
|
||||||
if (MARLIN_MOE_ARCHS)
|
if (MARLIN_MOE_ARCHS)
|
||||||
|
set(MARLIN_MOE_SRC
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
|
||||||
|
"csrc/moe/marlin_moe_ops.cu")
|
||||||
|
|
||||||
#
|
|
||||||
# For the Marlin MOE kernels we automatically generate sources for various
|
|
||||||
# preselected input type pairs and schedules.
|
|
||||||
# Generate sources:
|
|
||||||
set(MOE_MARLIN_GEN_SCRIPT
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
|
|
||||||
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
|
|
||||||
|
|
||||||
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
|
|
||||||
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
|
|
||||||
execute_process(
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
|
||||||
PYTHONPATH=$PYTHONPATH
|
|
||||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
|
|
||||||
RESULT_VARIABLE moe_marlin_generation_result
|
|
||||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
|
||||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT moe_marlin_generation_result EQUAL 0)
|
|
||||||
message(FATAL_ERROR "Marlin MOE generation failed."
|
|
||||||
" Result: \"${moe_marlin_generation_result}\""
|
|
||||||
"\nCheck the log for details: "
|
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
|
|
||||||
else()
|
|
||||||
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
|
|
||||||
message(STATUS "Marlin MOE generation completed successfully.")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MOE_WNAA16_MARLIN_SRC}"
|
SRCS "${MARLIN_MOE_SRC}"
|
||||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
|
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
|
||||||
|
|
||||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
@@ -725,17 +506,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 endif()
 
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(MOE_PERMUTE_SRC
-    "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
-    "csrc/moe/moe_permute_unpermute_op.cu")
-
-  set_gencode_flags_for_srcs(
-    SRCS "${MARLIN_PERMUTE_SRC}"
-    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
-
-  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
-endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
   _moe_C
@@ -744,8 +514,6 @@ define_gpu_extension_target(
   SOURCES ${VLLM_MOE_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
-  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)
 
@@ -755,7 +523,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   set(VLLM_ROCM_EXT_SRC
     "csrc/rocm/torch_bindings.cpp"
-    "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")
 
   define_gpu_extension_target(
@ -769,8 +536,77 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
|||||||
WITH_SOABI)
|
WITH_SOABI)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# For CUDA we also build and ship some external projects.
|
# vllm-flash-attn currently only supported on CUDA
|
||||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
include(cmake/external_projects/flashmla.cmake)
|
return()
|
||||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
|
||||||
|
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
|
||||||
|
# arches in the CUDA case (and instead set the gencodes on a per file basis)
|
||||||
|
# we need to manually set VLLM_GPU_ARCHES here.
|
||||||
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
foreach(_ARCH ${CUDA_ARCHS})
|
||||||
|
string(REPLACE "." "" _ARCH "${_ARCH}")
|
||||||
|
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
|
||||||
|
endforeach()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Build vLLM flash attention from source
|
||||||
|
#
|
||||||
|
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
|
||||||
|
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
|
||||||
|
# They should be identical but if they aren't, this is a massive footgun.
|
||||||
|
#
|
||||||
|
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
|
||||||
|
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
|
||||||
|
# If no component is specified, vllm-flash-attn is still installed.
|
||||||
|
|
||||||
|
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
|
||||||
|
# This is to enable local development of vllm-flash-attn within vLLM.
|
||||||
|
# It can be set as an environment variable or passed as a cmake argument.
|
||||||
|
# The environment variable takes precedence.
|
||||||
|
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||||
|
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(VLLM_FLASH_ATTN_SRC_DIR)
|
||||||
|
FetchContent_Declare(
|
||||||
|
vllm-flash-attn SOURCE_DIR
|
||||||
|
${VLLM_FLASH_ATTN_SRC_DIR}
|
||||||
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
)
|
||||||
|
else()
|
||||||
|
FetchContent_Declare(
|
||||||
|
vllm-flash-attn
|
||||||
|
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||||
|
GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
|
||||||
|
GIT_PROGRESS TRUE
|
||||||
|
# Don't share the vllm-flash-attn build between build types
|
||||||
|
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch the vllm-flash-attn library
|
||||||
|
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||||
|
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||||
|
|
||||||
|
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||||
|
# case only one is built, in the case both are built redundant work is done)
|
||||||
|
install(
|
||||||
|
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||||
|
DESTINATION vllm_flash_attn
|
||||||
|
COMPONENT _vllm_fa2_C
|
||||||
|
FILES_MATCHING PATTERN "*.py"
|
||||||
|
)
|
||||||
|
|
||||||
|
install(
|
||||||
|
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||||
|
DESTINATION vllm_flash_attn
|
||||||
|
COMPONENT _vllm_fa3_C
|
||||||
|
FILES_MATCHING PATTERN "*.py"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Nothing after vllm-flash-attn, see comment about macros above
|
||||||
|
|||||||
@@ -125,3 +125,4 @@ Community Impact Guidelines were inspired by
 For answers to common questions about this code of conduct, see the
 [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
 [Contributor Covenant translations](https://www.contributor-covenant.org/translations).
@@ -5,11 +5,11 @@
 # docs/source/contributing/dockerfile/dockerfile.md and
 # docs/source/assets/contributing/dockerfile-stages-dependency.png
 
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive
@@ -19,10 +19,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+    && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
     && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@@ -30,14 +27,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
     && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
 
 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
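The hunks above and below differ mainly in the installer: the left-hand side installs uv and drives everything through `uv pip install --system`, while the right-hand side stays on plain pip. A rough sketch of the uv-based flow outside Docker, under the same environment variables the left side sets (the values mirror the removed ENV lines; the requirements path is the one used later in this diff):

    # Install uv, then use it in place of pip for the system environment.
    python3 -m pip install uv
    export UV_HTTP_TIMEOUT=500                  # large wheels can exceed the default timeout
    export UV_INDEX_STRATEGY=unsafe-best-match  # resolve across the extra PyTorch index too
    uv pip install --system -r requirements/cuda.txt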
@@ -61,17 +50,15 @@ WORKDIR /workspace
 # we need to install torch and torchvision from the nightly builds first,
 # pytorch will not appear as a vLLM dependency in all of the following steps
 # after this step
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
     fi
 
-COPY requirements/common.txt requirements/common.txt
-COPY requirements/cuda.txt requirements/cuda.txt
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/cuda.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-cuda.txt
 
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@@ -89,21 +76,15 @@ FROM base AS build
 ARG TARGETPLATFORM
 
 # install build dependencies
-COPY requirements/build.txt requirements/build.txt
+COPY requirements-build.txt requirements-build.txt
 
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-build.txt
 
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
 # max jobs used by Ninja to build extensions
 ARG max_jobs=2
@@ -117,7 +98,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
 # if USE_SCCACHE is set, use sccache to speed up compilation
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" = "1" ]; then \
         echo "Installing sccache..." \
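For reference, the sccache-related ARGs in the hunk above are meant to be supplied at build time; a hedged example invocation (the `docker/Dockerfile` path and the image tag are assumptions for illustration, not taken from this diff):

    # Build the image with sccache enabled, reusing the default values shown above.
    docker build \
      --build-arg USE_SCCACHE=1 \
      --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
      --build-arg SCCACHE_REGION_NAME=us-west-2 \
      --build-arg max_jobs=2 \
      -f docker/Dockerfile -t vllm-local .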
@@ -137,12 +118,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     if [ "$USE_SCCACHE" != "1" ]; then \
-        # Clean any existing CMake artifacts
-        rm -rf .deps && \
-        mkdir -p .deps && \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
@@ -162,28 +140,18 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
 #################### DEV IMAGE ####################
 FROM base as dev
 
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
-
-COPY requirements/lint.txt requirements/lint.txt
-COPY requirements/test.txt requirements/test.txt
-COPY requirements/dev.txt requirements/dev.txt
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+COPY requirements-lint.txt requirements-lint.txt
+COPY requirements-test.txt requirements-test.txt
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-dev.txt
 #################### DEV IMAGE ####################
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
@@ -198,10 +166,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && apt-get update -y \
     && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
     && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+    && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
     && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@@ -209,14 +174,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
     && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version
-# Install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
 
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -228,53 +185,39 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
|||||||
# we need to install torch and torchvision from the nightly builds first,
|
# we need to install torch and torchvision from the nightly builds first,
|
||||||
# pytorch will not appear as a vLLM dependency in all of the following steps
|
# pytorch will not appear as a vLLM dependency in all of the following steps
|
||||||
# after this step
|
# after this step
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \
|
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
|
||||||
uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Install vllm wheel first, so that torch etc will be installed.
|
# Install vllm wheel first, so that torch etc will be installed.
|
||||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||||
--mount=type=cache,target=/root/.cache/uv \
|
--mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system dist/*.whl --verbose \
|
python3 -m pip install dist/*.whl --verbose
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
|
||||||
|
|
||||||
# If we need to build FlashInfer wheel before its release:
|
# How to build this FlashInfer wheel:
|
||||||
# $ export FLASHINFER_ENABLE_AOT=1
|
# $ export FLASHINFER_ENABLE_AOT=1
|
||||||
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
|
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
|
||||||
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
|
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
|
||||||
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
|
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
|
||||||
# $ cd flashinfer
|
# $ cd flashinfer
|
||||||
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
|
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
|
||||||
# $ rm -rf build
|
|
||||||
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
|
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
|
||||||
# $ ls dist
|
|
||||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
. /etc/environment && \
|
. /etc/environment && \
|
||||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||||
# TESTING: install FlashInfer from source to test 2.7.0 final RC
|
python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
|
||||||
FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
|
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
|
|
||||||
fi
|
fi
|
||||||
COPY examples examples
|
COPY examples examples
|
||||||
COPY benchmarks benchmarks
|
|
||||||
COPY ./vllm/collect_env.py .
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
. /etc/environment && \
|
|
||||||
uv pip list
|
|
||||||
|
|
||||||
# Although we build Flashinfer with AOT mode, there's still
|
# Although we build Flashinfer with AOT mode, there's still
|
||||||
# some issues w.r.t. JIT compilation. Therefore we need to
|
# some issues w.r.t. JIT compilation. Therefore we need to
|
||||||
# install build dependencies for JIT compilation.
|
# install build dependencies for JIT compilation.
|
||||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||||
COPY requirements/build.txt requirements/build.txt
|
COPY requirements-build.txt requirements-build.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system -r requirements/build.txt \
|
python3 -m pip install -r requirements-build.txt
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
@ -285,26 +228,17 @@ FROM vllm-base AS test
|
|||||||
|
|
||||||
ADD . /vllm-workspace/
|
ADD . /vllm-workspace/
|
||||||
|
|
||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# install development dependencies (for testing)
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
python3 -m pip install -r requirements-dev.txt
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|
||||||
|
|
||||||
# Workaround for #17068
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system -r requirements/dev.txt
|
python3 -m pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system -e tests/vllm_test_utils
|
|
||||||
|
|
||||||
# enable fast downloads from hf (for testing)
|
# enable fast downloads from hf (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||||
uv pip install --system hf_transfer
|
python3 -m pip install hf_transfer
|
||||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||||
|
|
||||||
# Copy in the v1 package for testing (it isn't distributed yet)
|
# Copy in the v1 package for testing (it isn't distributed yet)
|
||||||
@@ -321,18 +255,13 @@ RUN mv vllm test_docs/
 #################### OPENAI API SERVER ####################

 # base openai image with additional requirements, for any subsequent openai-style images
 FROM vllm-base AS vllm-openai-base
-ARG TARGETPLATFORM
-
-# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
-# Reference: https://github.com/astral-sh/uv/pull/1694
-ENV UV_HTTP_TIMEOUT=500

 # install additional dependencies for openai api server
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
     else \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
     fi

 ENV VLLM_USAGE_SOURCE production-docker-image
@@ -26,18 +26,18 @@ WORKDIR /workspace
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
-    pip install -r requirements/build.txt
+    pip install -r requirements-build.txt

 FROM cpu-test-arm AS build

 WORKDIR /workspace/vllm

 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-    pip install -v -r requirements/cpu.txt
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
+    pip install -v -r requirements-cpu.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
Dockerfile.cpu (new file, 69 lines)
@@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.

FROM ubuntu:22.04 AS cpu-test-1

ENV CCACHE_DIR=/root/.cache/ccache

ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp==2025.0.1

ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install intel_extension_for_pytorch==2.5.0

WORKDIR /workspace

ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -e tests/vllm_test_utils

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
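As a usage note on the file above: the non-AVX512 build argument it defines is passed at image build time. A minimal sketch of such a build command, assuming the image tag and `--shm-size` value are illustrative and not taken from the diff:

```bash
# Build the x86 CPU image from the repo root; tag and shm size are illustrative
docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g \
    --build-arg VLLM_CPU_DISABLE_AVX512="true" .   # optional, per the Dockerfile comment
```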
@@ -1,10 +1,10 @@
-FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

-RUN pip install -v -r requirements/hpu.txt
+RUN pip install -v -r requirements-hpu.txt

 ENV no_proxy=localhost,127.0.0.1
 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
@@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"

 FROM $BASE_IMAGE

@@ -21,14 +21,12 @@ VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm

 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
-RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install pytest

-# uninstall transformers-neuronx package explicitly to avoid version conflict
-RUN python3 -m pip uninstall -y transformers-neuronx
-
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
@@ -36,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \

 RUN python3 -m pip install -U \
     'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-    -r requirements/neuron.txt
+    -r requirements-neuron.txt

 ENV VLLM_TARGET_DEVICE neuron
 RUN --mount=type=bind,source=.git,target=.git \
@@ -45,10 +43,6 @@ RUN --mount=type=bind,source=.git,target=.git \
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils

-# install transformers-neuronx package as an optional dependencies (for V0)
-# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
-RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
-
 # overwrite entrypoint to run bash script
 RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
Dockerfile.openvino (new file, 29 lines)
@@ -0,0 +1,29 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
    apt-get install -y \
    git python3-pip \
    ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

RUN python3 -m pip install -U pip
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace

COPY examples/ /workspace/examples
COPY benchmarks/ /workspace/benchmarks

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

CMD ["/bin/bash"]
Dockerfile.ppc64le (new file, 37 lines)
@@ -0,0 +1,37 @@
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root

ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

RUN --mount=type=cache,target=/root/.cache/pip \
    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
    'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
    -r requirements-cpu.txt \
    xformers uvloop==0.20.0

RUN --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py install

# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -12,8 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}

 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y \
-    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
-    apt-transport-https ca-certificates wget curl
+    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
 # Remove sccache
 RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
@@ -39,14 +38,14 @@ FROM fetch_vllm AS build_vllm
 ARG USE_CYTHON
 # Build vLLM
 RUN cd vllm \
-    && python3 -m pip install -r requirements/rocm.txt \
+    && python3 -m pip install -r requirements-rocm.txt \
     && python3 setup.py clean --all \
-    && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
+    && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
     && python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
-COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
@@ -61,8 +60,7 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 # Install vLLM
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     cd /install \
-    && pip install -U -r requirements/rocm.txt \
-    && pip install -U -r requirements/rocm-test.txt \
+    && pip install -U -r requirements-rocm.txt \
     && pip uninstall -y vllm \
     && pip install *.whl

@@ -101,7 +99,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \
 # Install vLLM
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     cd /install \
-    && pip install -U -r requirements/rocm.txt \
+    && pip install -U -r requirements-rocm.txt \
     && pip uninstall -y vllm \
     && pip install *.whl

@@ -114,16 +112,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

-# ENV that can improve safe tensor loading, and end-to-end time
-ENV SAFETENSORS_FAST_GPU=1
-
-# User-friendly environment setting for multi-processing to avoid below RuntimeError.
-# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
-# you must use the 'spawn' start method
-# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1

 CMD ["/bin/bash"]
@@ -1,26 +1,24 @@
 ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
+ARG HIPBLASLT_BRANCH="4d40e36"
 ARG HIPBLAS_COMMON_BRANCH="7c1566b"
 ARG LEGACY_HIPBLASLT_OPTION=
 ARG RCCL_BRANCH="648a58d"
 ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
-ARG PYTORCH_VISION_BRANCH="v0.21.0"
+ARG PYTORCH_BRANCH="8d4926e"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
 ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="1a7f4dfa"
-ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="5a77249"
-ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+ARG FA_BRANCH="b7d29fb"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"

 FROM ${BASE_IMAGE} AS base

 ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

 ARG PYTHON_VERSION=3.12
@@ -31,11 +29,8 @@ ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+    && apt-get install -y software-properties-common git curl sudo vim less \
+    && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
        python${PYTHON_VERSION}-lib2to3 python-is-python3 \
@@ -45,7 +40,7 @@ RUN apt-get update -y \
     && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version

-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython

 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@@ -63,8 +58,7 @@ RUN cd hipBLAS-common \
 RUN git clone https://github.com/ROCm/hipBLASLt
 RUN cd hipBLASLt \
     && git checkout ${HIPBLASLT_BRANCH} \
-    && apt-get install -y llvm-dev \
-    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
     && cd build/release \
     && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@@ -114,24 +108,11 @@ RUN git clone ${FA_REPO}
 RUN cd flash-attention \
     && git checkout ${FA_BRANCH} \
     && git submodule update --init \
-    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
     && cp /app/vision/dist/*.whl /app/install \
     && cp /app/flash-attention/dist/*.whl /app/install

-FROM base AS build_aiter
-ARG AITER_BRANCH
-ARG AITER_REPO
-RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
-    pip install /install/*.whl
-RUN git clone --recursive ${AITER_REPO}
-RUN cd aiter \
-    && git checkout ${AITER_BRANCH} \
-    && git submodule update --init --recursive \
-    && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
-RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
-
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
     dpkg -i /install/*deb \
@@ -147,11 +128,8 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
     pip install /install/*.whl
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
     pip install /install/*.whl
-RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
-    pip install /install/*.whl

 ARG BASE_IMAGE
-ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
@@ -164,8 +142,6 @@ ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG FA_BRANCH
 ARG FA_REPO
-ARG AITER_BRANCH
-ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
     && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@@ -179,5 +155,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
     && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
     && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
-    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
@@ -15,15 +15,12 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

-# Remove existing versions of dependencies
-RUN pip uninstall -y torch torch_xla torchvision
-
 ENV VLLM_TARGET_DEVICE="tpu"
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 -m pip install \
-        -r requirements/tpu.txt
-RUN python3 -m pip install -e .
+        -r requirements-tpu.txt
+RUN python3 setup.py develop

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
Dockerfile.xpu (new file, 69 lines)
@@ -0,0 +1,69 @@
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
    curl \
    ffmpeg \
    git \
    libsndfile1 \
    libsm6 \
    libxext6 \
    libgl1 \
    lsb-release \
    numactl \
    python3 \
    python3-dev \
    python3-pip \
    # vim \
    wget

WORKDIR /workspace/vllm
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
COPY requirements-common.txt /workspace/vllm/requirements-common.txt

RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-cache-dir \
    -r requirements-xpu.txt

RUN git clone https://github.com/intel/pti-gpu && \
    cd pti-gpu/sdk && \
    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
    mkdir build && \
    cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
    make -j && \
    cmake --install . --config Release --prefix "/usr/local"

ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

COPY . .
ARG GIT_REPO_CHECK
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

ENV VLLM_TARGET_DEVICE=xpu

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
    python3 setup.py install

CMD ["/bin/bash"]

FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer 'modelscope!=1.15.0'

ENV VLLM_USAGE_SOURCE production-docker-image \
    TRITON_XPU_PROFILE 1
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
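For orientation, building and running an image from a Dockerfile like the one above typically looks as follows; the image tag and the device-passthrough flags are illustrative assumptions and depend on the host's Intel GPU setup, not on anything stated in the diff:

```bash
# Build the XPU image from the repo root (tag is illustrative)
docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
# Run the OpenAI-compatible server stage, passing through the GPU device nodes
docker run -it --rm --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    vllm-xpu-env
```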
MANIFEST.in (10 lines)
@@ -1,9 +1,9 @@
 include LICENSE
-include requirements/common.txt
-include requirements/cuda.txt
-include requirements/rocm.txt
-include requirements/neuron.txt
-include requirements/cpu.txt
+include requirements-common.txt
+include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
 include CMakeLists.txt

 recursive-include cmake *
README.md (35 lines changed)
@@ -10,25 +10,14 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

 ---

 *Latest News* 🔥
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
-- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
-- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
+- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
-<details>
-<summary>Previous News</summary>
-
-- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
-- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
-- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
@@ -43,11 +32,8 @@ Easy, fast, and cheap LLM serving for everyone
 - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

-</details>
-
 ---
 ## About

 vLLM is a fast and easy-to-use library for LLM inference and serving.

 Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
@@ -93,14 +79,14 @@ pip install vllm
 ```

 Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
-- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
 - [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
 - [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

 ## Contributing

 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

 ## Sponsors

@@ -123,7 +109,6 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
-- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
@@ -142,7 +127,6 @@ We also have an official fundraising venue through [OpenCollective](https://open
 ## Citation

 If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):

 ```bibtex
 @inproceedings{kwon2023efficient,
     title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
@@ -154,12 +138,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

-- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
-- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
-- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
-- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
-- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+* For technical questions and feature requests, please use Github issues or discussions.
+* For discussing with fellow users and coordinating contributions and development, please use Slack.
+* For security disclosures, please use Github's security advisory feature.
+* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

 ## Media Kit

-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
+* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
RELEASE.md (removed, 54 lines)
@@ -1,54 +0,0 @@
# Releasing vLLM

vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.

## Release Versioning

vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.

* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
* _minor_ major features
* _patch_ features and backwards-compatible bug fixes
* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release

## Release Cadence

Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.

| Release Date | Patch release versions | Post Release versions |
| --- | --- | --- |
| Jan 2025 | 0.7.0 | --- |
| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- |
| Mar 2025 | 0.7.4, 0.7.5 | --- |
| Apr 2025 | 0.7.6, 0.7.7 | --- |
| May 2025 | 0.7.8, 0.7.9 | --- |
| Jun 2025 | 0.7.10, 0.7.11 | --- |
| Jul 2025 | 0.7.12, 0.7.13 | --- |
| Aug 2025 | 0.7.14, 0.7.15 | --- |
| Sep 2025 | 0.7.16, 0.7.17 | --- |
| Oct 2025 | 0.7.18, 0.7.19 | --- |
| Nov 2025 | 0.7.20, 0.7.21 | --- |
| Dec 2025 | 0.7.22, 0.7.23 | --- |

## Release branch

Each release is built from a dedicated release branch.

* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
* For post releases, previously cut release branch is reused
* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.

## Release Cherry-Pick Criteria

After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.

* Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release)
* Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks
* Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release)
* Documentation improvements
* Release branch specific changes (e.g. change version identifiers or CI fixes)

Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
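The removed RELEASE.md above describes a tag-driven flow in which RC tags trigger release builds and the final tag does not. A minimal sketch of that flow, with a hypothetical version number and branch name:

```bash
# Cut a release branch shortly before the release (branch name is hypothetical)
git checkout -b releases/v0.7.4 main
# Pushing an RC tag in the vX.Y.Z-rc1 style triggers the release build; repeat with -rc2, -rc3 as needed
git tag v0.7.4-rc1 && git push origin v0.7.4-rc1
# The final vX.Y.Z tag does not trigger a build; it marks release notes and assets
git tag v0.7.4 && git push origin v0.7.4
```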
@ -1,343 +1,19 @@
|
|||||||
# Benchmarking vLLM
|
# Benchmarking vLLM
|
||||||
|
|
||||||
This README guides you through running benchmark tests with the extensive
|
## Downloading the ShareGPT dataset
|
||||||
datasets supported on vLLM. It’s a living document, updated as new features and datasets
|
|
||||||
become available.
|
|
||||||
|
|
||||||
## Dataset Overview
|
|
||||||
|
|
||||||
<table style="width:100%; border-collapse: collapse;">
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th style="width:15%; text-align: left;">Dataset</th>
|
|
||||||
<th style="width:10%; text-align: center;">Online</th>
|
|
||||||
<th style="width:10%; text-align: center;">Offline</th>
|
|
||||||
<th style="width:65%; text-align: left;">Data Path</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td><strong>ShareGPT</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>BurstGPT</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>Sonnet</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>Random</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>synthetic</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-VisionArena</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>lmarena-ai/VisionArena-Chat</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-InstructCoder</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>likaixin/InstructCoder</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-AIMO</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-Other</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
✅: supported
|
|
||||||
|
|
||||||
🟡: Partial support
|
|
||||||
|
|
||||||
🚧: to be supported
|
|
||||||
|
|
||||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
|
||||||
|
|
||||||
---
|
|
||||||
## Example - Online Benchmark
|
|
||||||
|
|
||||||
First start serving your model
|
|
||||||
|
|
||||||
|
You can download the dataset by running:
|
||||||
```bash
|
```bash
|
||||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
```
|
```
|
||||||
|
|
||||||
Then run the benchmarking script
|
## Downloading the ShareGPT4V dataset
|
||||||
|
|
||||||
|
The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
|
||||||
|
will ignore a datapoint if the referred image is missing.
|
||||||
```bash
|
```bash
|
||||||
# download dataset
|
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
mkdir coco -p
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
|
||||||
--backend vllm \
|
unzip coco/train2017.zip -d coco/
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--endpoint /v1/completions \
|
|
||||||
--dataset-name sharegpt \
|
|
||||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
```
|
||||||
|
|
||||||
If successful, you will see the following output
|
|
||||||
|
|
||||||
```
|
|
||||||
============ Serving Benchmark Result ============
|
|
||||||
Successful requests: 10
|
|
||||||
Benchmark duration (s): 5.78
|
|
||||||
Total input tokens: 1369
|
|
||||||
Total generated tokens: 2212
|
|
||||||
Request throughput (req/s): 1.73
|
|
||||||
Output token throughput (tok/s): 382.89
|
|
||||||
Total Token throughput (tok/s): 619.85
|
|
||||||
---------------Time to First Token----------------
|
|
||||||
Mean TTFT (ms): 71.54
|
|
||||||
Median TTFT (ms): 73.88
|
|
||||||
P99 TTFT (ms): 79.49
|
|
||||||
-----Time per Output Token (excl. 1st token)------
|
|
||||||
Mean TPOT (ms): 7.91
|
|
||||||
Median TPOT (ms): 7.96
|
|
||||||
P99 TPOT (ms): 8.03
|
|
||||||
---------------Inter-token Latency----------------
|
|
||||||
Mean ITL (ms): 7.74
|
|
||||||
Median ITL (ms): 7.70
|
|
||||||
P99 ITL (ms): 8.39
|
|
||||||
==================================================
|
|
||||||
```
|
|
||||||
|
|
||||||
### VisionArena Benchmark for Vision Language Models
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# need a model with vision capability here
|
|
||||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
|
||||||
--backend openai-chat \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--endpoint /v1/chat/completions \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 1000
|
|
||||||
```
|
|
||||||
|
|
||||||
### InstructCoder Benchmark with Speculative Decoding
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
|
||||||
--speculative-model "[ngram]" \
|
|
||||||
--ngram_prompt_lookup_min 2 \
|
|
||||||
--ngram-prompt-lookup-max 5 \
|
|
||||||
--num_speculative_tokens 5
|
|
||||||
```
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
python3 benchmarks/benchmark_serving.py \
|
|
||||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path likaixin/InstructCoder \
|
|
||||||
--num-prompts 2048
|
|
||||||
```
|
|
||||||
|
|
||||||
### Other HuggingFaceDataset Examples
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
|
||||||
```
|
|
||||||
|
|
||||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
|
||||||
--backend openai-chat \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--endpoint /v1/chat/completions \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
|
||||||
--hf-split train \
|
|
||||||
--hf-subset "chart2text(cauldron)" \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
|
||||||
--backend openai-chat \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--endpoint /v1/chat/completions \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
**`AI-MO/aimo-validation-aime`**
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
|
||||||
--model Qwen/QwQ-32B \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path AI-MO/aimo-validation-aime \
|
|
||||||
--num-prompts 10 \
|
|
||||||
--seed 42
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running With Sampling Parameters
|
|
||||||
|
|
||||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
|
||||||
parameters can be specified. Example client command:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 vllm/benchmarks/benchmark_serving.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--endpoint /v1/completions \
|
|
||||||
--dataset-name sharegpt \
|
|
||||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
|
||||||
--top-k 10 \
|
|
||||||
--top-p 0.9 \
|
|
||||||
--temperature 0.5 \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
## Example - Offline Throughput Benchmark
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset-name sonnet \
|
|
||||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
If successful, you will see the following output
|
|
||||||
|
|
||||||
```
|
|
||||||
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
|
||||||
Total num prompt tokens: 5014
|
|
||||||
Total num output tokens: 1500
|
|
||||||
```
|
|
||||||
|
|
||||||
### VisionArena Benchmark for Vision Language Models
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--backend vllm-chat \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
|
||||||
--num-prompts 1000 \
|
|
||||||
--hf-split train
|
|
||||||
```
|
|
||||||
|
|
||||||
The `num prompt tokens` now includes image token counts
|
|
||||||
|
|
||||||
```
|
|
||||||
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
|
||||||
Total num prompt tokens: 14527
|
|
||||||
Total num output tokens: 1280
|
|
||||||
```
|
|
||||||
|
|
||||||
### InstructCoder Benchmark with Speculative Decoding

```bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
    --dataset-name=hf \
    --dataset-path=likaixin/InstructCoder \
    --model=meta-llama/Meta-Llama-3-8B-Instruct \
    --input-len=1000 \
    --output-len=100 \
    --num-prompts=2048 \
    --async-engine \
    --speculative-model="[ngram]" \
    --ngram_prompt_lookup_min=2 \
    --ngram-prompt-lookup-max=5 \
    --num_speculative_tokens=5
```

```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```

### Other HuggingFaceDataset Examples

**`lmms-lab/LLaVA-OneVision-Data`**

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --backend vllm-chat \
    --dataset-name hf \
    --dataset-path lmms-lab/LLaVA-OneVision-Data \
    --hf-split train \
    --hf-subset "chart2text(cauldron)" \
    --num-prompts 10
```

**`Aeala/ShareGPT_Vicuna_unfiltered`**

```bash
python3 vllm/benchmarks/benchmark_throughput.py \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --backend vllm-chat \
    --dataset-name hf \
    --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
    --hf-split train \
    --num-prompts 10
```

**`AI-MO/aimo-validation-aime`**

```bash
python3 benchmarks/benchmark_throughput.py \
    --model Qwen/QwQ-32B \
    --backend vllm \
    --dataset-name hf \
    --dataset-path AI-MO/aimo-validation-aime \
    --hf-split train \
    --num-prompts 10
```

### Benchmark with LoRA Adapters

```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \
    --model meta-llama/Llama-2-7b-hf \
    --backend vllm \
    --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
    --dataset_name sharegpt \
    --num-prompts 10 \
    --max-loras 2 \
    --max-lora-rank 8 \
    --enable-lora \
    --lora-path yard1/llama-2-7b-sql-lora-test
```

@ -1,212 +0,0 @@
#!/bin/bash

# This script aims to tune the server parameter combination that maximizes throughput for a given requirement.
# The server parameters currently tuned are max_num_seqs and max_num_batched_tokens.
# It also supports additional requirements: e2e latency and prefix cache hit rate.

# Pre-requisites:
# 1. Check out your branch and install/update the correct running environment. For TPU, activate the conda env and install the corresponding torch and XLA versions.
# 2. If the model is customized, replace MODEL's config with the customized config.
# 3. Set the variables (ALL REQUIRED):
#    BASE: your directory for the vllm repo
#    MODEL: the model served by vllm
#    DOWNLOAD_DIR: directory to download and load model weights from
#    INPUT_LEN: request input length
#    OUTPUT_LEN: request output length
#    MIN_CACHE_HIT_PCT: prefix cache hit rate
#    MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there is no latency requirement, set it to a large number such as 1000000000.
# 4. Run the script. It might take a long time; use tmux so the run is not killed if your connection drops.
# 5. The final result will be saved in the RESULT file.


# Example use cases
# 1. Given input_len=1800, output_len=20, what are the best max_num_seqs and max_num_batched_tokens for the highest throughput?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If the e2e latency must stay below 500 ms, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach a 60% prefix cache hit rate, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
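# A minimal illustrative setup (placeholder values, not recommendations): fill in the variables
# below in this file, then launch the script, ideally inside tmux, e.g.
#   tmux new -s auto-tune
#   bash <path to this script>
# When it finishes, the best max_num_seqs / max_num_batched_tokens combination that met the latency
# requirement is recorded in "$BASE/auto-benchmark/<timestamp>/result.txt".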

TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"

echo "result file: $RESULT"
echo "model: $MODEL"
echo

rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER

cd "$BASE/vllm"
# create sonnet_4x.txt so that we can sample enough tokens for the requested input length
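# (a single copy of sonnet.txt is too short for INPUT_LEN-sized prompts, so four copies are concatenated below)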
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
    cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

pip install datasets

current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log

    # start the server
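    # (dummy, randomly initialized weights are loaded via --load-format dummy, so no real checkpoint
    # is needed for tuning; all server output is captured in $vllm_log)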
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization 0.98 \
        --max-num-seqs $max_num_seqs \
        --max-num-batched-tokens $max_num_batched_tokens \
        --tensor-parallel-size 1 \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir $DOWNLOAD_DIR \
        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
    echo "wait for 10 minutes..."
    echo
    # wait up to 10 minutes for the server to come up
    server_started=0
    for i in {1..60}; do
        if grep -Fq "Application startup complete" "$vllm_log"; then
            echo "Application started"
            server_started=1
            break
        else
            # echo "wait for 10 seconds..."
            sleep 10
        fi
    done

    if (( ! server_started )); then
        echo "server did not start within 10 minutes, terminating the benchmark. Please check the server log at $vllm_log"
        echo "pkill -f vllm"
        echo
        pkill vllm
        sleep 10
        return 1
    fi

    echo "run benchmark test..."
    echo
    meet_latency_requirement=0
    # get a baseline qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
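    # e.g. with INPUT_LEN=4000 and MIN_CACHE_HIT_PCT=60, prefix_len=2400: the first 2400 tokens of
    # each prompt form a shared prefix, so roughly 60% of every request can hit the prefix cache.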
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL \
        --dataset-name sonnet \
        --dataset-path benchmarks/sonnet_4x.txt \
        --sonnet-input-len $INPUT_LEN \
        --sonnet-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 100 \
        --sonnet-prefix-len $prefix_len \
        --port 8004 > "$bm_log"
    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
    fi

    if (( ! meet_latency_requirement )); then
        # start from request-rate as int(through_put) + 1
        request_rate=$((${through_put%.*} + 1))
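        # then walk the request rate down one step at a time, re-running the benchmark, until the
        # P99 e2e latency fits under MAX_LATENCY_ALLOWED_MS or the rate reaches zero.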
        while ((request_rate > 0)); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL \
                --dataset-name sonnet \
                --dataset-path benchmarks/sonnet_4x.txt \
                --sonnet-input-len $INPUT_LEN \
                --sonnet-output-len $OUTPUT_LEN \
                --ignore_eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --sonnet-prefix-len $prefix_len \
                --port 8004 > "$bm_log"
            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$((request_rate-1))
        done
    fi
    # write the results and update the best result.
    if ((meet_latency_requirement)); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $through_put, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $through_put, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
            best_throughput=$through_put
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    echo "pkill -f vllm"
    echo
    pkill vllm
    sleep 10
    rm -f $vllm_log
    printf '=%.0s' $(seq 1 20)
    return 0
}


num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
    for num_batched_tokens in $num_batched_tokens_list; do
        run_benchmark $num_seqs $num_batched_tokens
    done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"

@ -1,13 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0

-import io
 import json
 import os
 import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
+from typing import List, Optional, Union

 import aiohttp
 import huggingface_hub.constants
@ -15,9 +14,6 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)

-# NOTE(simon): do not import vLLM here so the benchmark script
-# can run without vLLM installed.
-
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@ -29,11 +25,11 @@ class RequestFuncInput:
     output_len: int
     model: str
     model_name: Optional[str] = None
+    best_of: int = 1
     logprobs: Optional[int] = None
     extra_body: Optional[dict] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
-    language: Optional[str] = None


 @dataclass
@ -43,8 +39,8 @@ class RequestFuncOutput:
     latency: float = 0.0
     output_tokens: int = 0
     ttft: float = 0.0  # Time to first token
-    itl: list[float] = field(
-        default_factory=list)  # list of inter-token latencies
+    itl: List[float] = field(
+        default_factory=list)  # List of inter-token latencies
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
@ -60,12 +56,13 @@ async def async_request_tgi(
     async with aiohttp.ClientSession(trust_env=True,
                                      timeout=AIOHTTP_TIMEOUT) as session:
         params = {
+            "best_of": request_func_input.best_of,
             "max_new_tokens": request_func_input.output_len,
             "do_sample": True,
             "temperature": 0.01,  # TGI does not accept 0.0 temperature.
             "top_p": 0.99,  # TGI does not accept 1.0 top_p.
             "truncate": request_func_input.prompt_len,
-            "ignore_eos_token": request_func_input.ignore_eos,
+            # TGI does not accept ignore_eos flag.
         }
         payload = {
             "inputs": request_func_input.prompt,
@ -73,10 +70,6 @@ async def async_request_tgi(
         }
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
-        if request_func_input.ignore_eos:
-            output.output_tokens = request_func_input.output_len
-        else:
-            output.output_tokens = None

         ttft = 0.0
         st = time.perf_counter()
@ -135,6 +128,7 @@ async def async_request_trt_llm(

     async with aiohttp.ClientSession(trust_env=True,
                                      timeout=AIOHTTP_TIMEOUT) as session:
+        assert request_func_input.best_of == 1
         payload = {
             "accumulate_tokens": True,
             "text_input": request_func_input.prompt,
@ -199,9 +193,9 @@ async def async_request_deepspeed_mii(
 ) -> RequestFuncOutput:
     async with aiohttp.ClientSession(trust_env=True,
                                      timeout=AIOHTTP_TIMEOUT) as session:
+        assert request_func_input.best_of == 1

         payload = {
-            "model": request_func_input.model,
             "prompt": request_func_input.prompt,
             "max_tokens": request_func_input.output_len,
             "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
@ -222,15 +216,7 @@ async def async_request_deepspeed_mii(
                 if response.status == 200:
                     parsed_resp = await response.json()
                     output.latency = time.perf_counter() - st
-                    if "choices" in parsed_resp:
-                        output.generated_text = parsed_resp["choices"][0][
-                            "text"]
-                    elif "text" in parsed_resp:
-                        output.generated_text = parsed_resp["text"][0]
-                    else:
-                        output.error = ("Unexpected response format: "
-                                        "neither 'choices' nor 'text' found")
-                        output.success = False
+                    output.generated_text = parsed_resp["text"][0]
                     output.success = True
                 else:
                     output.error = response.reason or ""
@ -261,7 +247,7 @@ async def async_request_openai_completions(
             if request_func_input.model_name else request_func_input.model,
             "prompt": request_func_input.prompt,
             "temperature": 0.0,
-            "repetition_penalty": 1.0,
+            "best_of": request_func_input.best_of,
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
             "stream": True,
@ -350,7 +336,7 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(
-        ("chat/completions", "profile")
+        "chat/completions"
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

     async with aiohttp.ClientSession(trust_env=True,
@ -440,125 +426,16 @@ async def async_request_openai_chat_completions(
     return output


-async def async_request_openai_audio(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    # Lazy import without PlaceholderModule to avoid vllm dep.
-    import soundfile
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        ("transcriptions", "translations"
-         )), "OpenAI Chat Completions API URL must end with 'transcriptions' "
-    "or `translations`."
-
-    async with aiohttp.ClientSession(trust_env=True,
-                                     timeout=AIOHTTP_TIMEOUT) as session:
-        content = [{"type": "text", "text": request_func_input.prompt}]
-        payload = {
-            "model": request_func_input.model_name \
-                if request_func_input.model_name else request_func_input.model,
-            "temperature": 0.0,
-            "max_completion_tokens": request_func_input.output_len,
-            "stream": True,
-            "language": "en",
-            # Flattened due to multipart/form-data
-            "stream_include_usage": True,
-            "stream_continuous_usage_stats": True
-        }
-        if request_func_input.extra_body:
-            payload.update(request_func_input.extra_body)
-        headers = {
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-
-        # Send audio file
-        def to_bytes(y, sr):
-            buffer = io.BytesIO()
-            soundfile.write(buffer, y, sr, format="WAV")
-            buffer.seek(0)
-            return buffer
-
-        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
-            form = aiohttp.FormData()
-            form.add_field('file', f, content_type='audio/wav')
-            for key, value in payload.items():
-                form.add_field(key, str(value))
-
-            output = RequestFuncOutput()
-            output.prompt_len = request_func_input.prompt_len
-
-            generated_text = ""
-            ttft = 0.0
-            st = time.perf_counter()
-            most_recent_timestamp = st
-            try:
-                async with session.post(url=api_url,
-                                        data=form,
-                                        headers=headers) as response:
-                    if response.status == 200:
-                        async for chunk_bytes in response.content:
-                            chunk_bytes = chunk_bytes.strip()
-                            if not chunk_bytes:
-                                continue
-
-                            chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                "data: ")
-                            if chunk != "[DONE]":
-                                timestamp = time.perf_counter()
-                                data = json.loads(chunk)
-
-                                if choices := data.get("choices"):
-                                    content = choices[0]["delta"].get(
-                                        "content")
-                                    # First token
-                                    if ttft == 0.0:
-                                        ttft = timestamp - st
-                                        output.ttft = ttft
-
-                                    # Decoding phase
-                                    else:
-                                        output.itl.append(
-                                            timestamp - most_recent_timestamp)
-
-                                    generated_text += content or ""
-                                elif usage := data.get("usage"):
-                                    output.output_tokens = usage.get(
-                                        "completion_tokens")
-
-                                most_recent_timestamp = timestamp
-
-                        output.generated_text = generated_text
-                        output.success = True
-                        output.latency = most_recent_timestamp - st
-                    else:
-                        output.error = response.reason or ""
-                        output.success = False
-            except Exception:
-                output.success = False
-                exc_info = sys.exc_info()
-                output.error = "".join(traceback.format_exception(*exc_info))
-
-            if pbar:
-                pbar.update(1)
-            return output
-
-
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download

-        from vllm.model_executor.model_loader.weight_utils import get_lock
-
-        # Use file lock to prevent multiple processes from
-        # downloading the same model weights at the same time.
-        with get_lock(pretrained_model_name_or_path):
-            model_path = snapshot_download(
-                model_id=pretrained_model_name_or_path,
-                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-
-            return model_path
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+        return model_path
     return pretrained_model_name_or_path


@ -601,14 +478,7 @@ ASYNC_REQUEST_FUNCS = {
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
-    "openai-audio": async_request_openai_audio,
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions,
-             async_request_openai_chat_completions)
-]