Compare commits: pil_image...sampler-en (4 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 4c42267293 | |
| | 24f68342b4 | |
| | c5d963835b | |
| | b313220727 | |
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:

@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,4 +1,3 @@
-# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:
@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.335
-  - name: "exact_match,flexible-extract"
-    value: 0.323
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:

@@ -1,5 +1,4 @@
-# For hf script, without -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"

@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
-model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.30
-  - name: "exact_match,flexible-extract"
-    value: 0.465
-limit: 1319
-num_fewshot: 5
@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
-model_name: "Qwen/Qwen2.5-1.5B-Instruct"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.54
-  - name: "exact_match,flexible-extract"
-    value: 0.59
-limit: 1319
-num_fewshot: 5

@@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
-model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.47
-  - name: "exact_match,flexible-extract"
-    value: 0.64
-limit: 1319
-num_fewshot: 5

@@ -1,4 +1,3 @@
-# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml

@@ -1,6 +1,10 @@
-Qwen2.5-1.5B-Instruct.yaml
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
-Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Minitron-4B-Base-FP8.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-FP8W8.yaml
+Meta-Llama-3-8B-QQQ.yaml
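A minimal sketch of how such a list is consumed — one model-config filename per line, assuming the conventions visible in the run-tests.sh and conftest.py hunks elsewhere in this compare; the loop below is illustrative, not part of the diff:

```bash
# Hypothetical sketch: run one GSM8K eval per entry in a model list.
# Mirrors the loop in the run-tests.sh hunk further down; paths are illustrative.
while IFS= read -r MODEL_CONFIG; do
    # skip blanks and comment lines, as conftest.py does on the other side of the diff
    [[ -z "$MODEL_CONFIG" || "$MODEL_CONFIG" == \#* ]] && continue
    export LM_EVAL_TEST_DATA_FILE="$PWD/configs/$MODEL_CONFIG"
    export LM_EVAL_TP_SIZE=1
    pytest -s test_lm_eval_correctness.py
done < configs/models-small.txt
```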
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-from pathlib import Path
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--config-list-file",
-        action="store",
-        help="Path to the file listing model config YAMLs (one per line)")
-    parser.addoption("--tp-size",
-                     action="store",
-                     default="1",
-                     help="Tensor parallel size to use for evaluation")
-
-
-@pytest.fixture(scope="session")
-def config_list_file(pytestconfig, config_dir):
-    rel_path = pytestconfig.getoption("--config-list-file")
-    return config_dir / rel_path
-
-
-@pytest.fixture(scope="session")
-def tp_size(pytestconfig):
-    return pytestconfig.getoption("--tp-size")
-
-
-def pytest_generate_tests(metafunc):
-    if "config_filename" in metafunc.fixturenames:
-        rel_path = metafunc.config.getoption("--config-list-file")
-        config_list_file = Path(rel_path).resolve()
-        config_dir = config_list_file.parent
-        with open(config_list_file, encoding="utf-8") as f:
-            configs = [
-                config_dir / line.strip() for line in f
-                if line.strip() and not line.startswith("#")
-            ]
-        metafunc.parametrize("config_filename", configs)
.buildkite/lm-eval-harness/run-tests.sh (new file, 59 lines)
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on GSM8k using vllm and compares to "
+    echo "precomputed baseline (measured by HF transformers.)"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c )
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
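Going by the flags the added script parses (-c for the model list, -t for tensor parallel size), a typical invocation would presumably look like the following; the paths are illustrative assumptions, not taken from the diff:

```bash
# Assumed usage of the added runner script.
cd .buildkite/lm-eval-harness
bash ./run-tests.sh -c configs/models-small.txt -t 1
```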
@@ -3,25 +3,35 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
 
-pytest -s -v test_lm_eval_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4
+* pytest -s test_lm_eval_correctness.py
 """
 
+import os
+from pathlib import Path
+
 import lm_eval
-import numpy as np
+import numpy
+import pytest
 import yaml
 
-RTOL = 0.08
+RTOL = 0.05
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
 
 
-def launch_lm_eval(eval_config, tp_size):
+def launch_lm_eval(eval_config):
     trust_remote_code = eval_config.get('trust_remote_code', False)
 
     model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={tp_size}," \
-                 f"enforce_eager=true," \
+                 f"tensor_parallel_size={TP_SIZE}," \
                  f"add_bos_token=true," \
                  f"trust_remote_code={trust_remote_code}"
 
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,

@@ -29,14 +39,22 @@ def launch_lm_eval(eval_config, tp_size):
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
         batch_size="auto")
 
     return results
 
 
-def test_lm_eval_correctness_param(config_filename, tp_size):
-    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
 
-    results = launch_lm_eval(eval_config, tp_size)
+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")
+
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
 
+    # Confirm scores match ground truth.
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:

@@ -44,7 +62,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and np.isclose(
+            success = success and numpy.isclose(
                 ground_truth, measured_value, rtol=RTOL)
 
+    # Assert at the end, print all scores even on failure for debugging.
     assert success
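The two invocation styles this hunk swaps between are spelled out in the docstrings; side by side they read roughly as follows (model and config names are taken from those docstrings, not prescriptive):

```bash
# Removed-side style: parametrize the test over a config list via pytest options.
pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1

# Added-side style: select a single config and TP size through environment variables.
export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
export LM_EVAL_TP_SIZE=4
pytest -s test_lm_eval_correctness.py
```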
@@ -10,24 +10,15 @@ set -x
 set -o pipefail
 
 check_gpus() {
-  if command -v nvidia-smi; then
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-  fi
-
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   if [[ $gpu_count -gt 0 ]]; then
     echo "GPU found."
   else
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
-  if command -v nvidia-smi; then
-    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
-  elif command -v amd-smi; then
-    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-  fi
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   echo "GPU type is $gpu_type"
 }
 

@@ -99,15 +90,9 @@ kill_gpu_processes() {
 
 
   # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v amd-smi; then
-    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
-      sleep 1
-    done
-  fi
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done
 
   # remove vllm config file
   rm -rf ~/.config/vllm
@@ -63,12 +63,10 @@
         "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
         "disable_log_requests": "",
         "tensor_parallel_size": 4,
         "swap_space": 16,
-        "speculative_config": {
-            "model": "turboderp/Qwama-0.5B-Instruct",
-            "num_speculative_tokens": 4,
-            "draft_tensor_parallel_size": 1
-        }
+        "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+        "num_speculative_tokens": 4,
+        "speculative_draft_tensor_parallel_size": 1
     },
     "client_parameters": {
         "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
@@ -1,23 +1,23 @@
 steps:
-  - label: "Build wheel - CUDA 12.8"
+  - label: "Build wheel - CUDA 12.4"
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 12.6"
+  - label: "Build wheel - CUDA 12.1"
    agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 

@@ -31,10 +31,10 @@ steps:
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 

@@ -48,7 +48,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
   - label: "Build and publish TPU release image"

@@ -57,9 +57,7 @@ steps:
     agents:
       queue: tpu_queue_postmerge
     commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
       - "docker push vllm/vllm-tpu:nightly"
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:

@@ -84,22 +82,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
-
-  - block: "Build Neuron release image"
-    key: block-neuron-release-image-build
-    depends_on: ~
-
-  - label: "Build and publish Neuron release image"
-    depends_on: block-neuron-release-image-build
-    agents:
-      queue: neuron-postmerge
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
-    env:
-      DOCKER_BUILDKIT: "1"
@@ -75,84 +75,49 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 #ignore certain kernels tests
-if [[ $commands == *" kernels/core"* ]]; then
+if [[ $commands == *" kernels "* ]]; then
   commands="${commands} \
-  --ignore=kernels/core/test_fused_quant_layernorm.py \
-  --ignore=kernels/core/test_permute_cols.py"
-fi
-
-if [[ $commands == *" kernels/attention"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/attention/stest_attention_selector.py \
-  --ignore=kernels/attention/test_blocksparse_attention.py \
-  --ignore=kernels/attention/test_encoder_decoder_attn.py \
-  --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/attention/test_flash_attn.py \
-  --ignore=kernels/attention/test_flashinfer.py \
-  --ignore=kernels/attention/test_prefix_prefill.py \
-  --ignore=kernels/attention/test_cascade_flash_attn.py \
-  --ignore=kernels/attention/test_mha_attn.py \
-  --ignore=kernels/attention/test_lightning_attn.py \
-  --ignore=kernels/attention/test_attention.py"
-fi
-
-if [[ $commands == *" kernels/quantization"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/quantization/test_int8_quant.py \
-  --ignore=kernels/quantization/test_aqlm.py \
-  --ignore=kernels/quantization/test_machete_mm.py \
-  --ignore=kernels/quantization/test_block_fp8.py \
-  --ignore=kernels/quantization/test_block_int8.py \
-  --ignore=kernels/quantization/test_marlin_gemm.py \
-  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-  --ignore=kernels/quantization/test_int8_kernel.py"
-fi
-
-if [[ $commands == *" kernels/mamba"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/mamba/test_mamba_mixer2.py \
-  --ignore=kernels/mamba/test_causal_conv1d.py \
-  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
-fi
-
-if [[ $commands == *" kernels/moe"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/moe/test_moe.py \
-  --ignore=kernels/moe/test_cutlass_moe.py \
-  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+  --ignore=kernels/test_attention_selector.py \
+  --ignore=kernels/test_blocksparse_attention.py \
+  --ignore=kernels/test_causal_conv1d.py \
+  --ignore=kernels/test_cutlass.py \
+  --ignore=kernels/test_encoder_decoder_attn.py \
+  --ignore=kernels/test_flash_attn.py \
+  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_int8_quant.py \
+  --ignore=kernels/test_machete_gemm.py \
+  --ignore=kernels/test_mamba_ssm.py \
+  --ignore=kernels/test_marlin_gemm.py \
+  --ignore=kernels/test_moe.py \
+  --ignore=kernels/test_prefix_prefill.py \
+  --ignore=kernels/test_rand.py \
+  --ignore=kernels/test_sampler.py \
+  --ignore=kernels/test_cascade_flash_attn.py \
+  --ignore=kernels/test_mamba_mixer2.py \
+  --ignore=kernels/test_aqlm.py \
+  --ignore=kernels/test_machete_mm.py \
+  --ignore=kernels/test_mha_attn.py \
+  --ignore=kernels/test_block_fp8.py \
+  --ignore=kernels/test_permute_cols.py"
 fi
 
 #ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
   commands=${commands//" entrypoints/openai "/" entrypoints/openai \
   --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_chat.py \
   --ignore=entrypoints/openai/test_shutdown.py \
   --ignore=entrypoints/openai/test_completion.py \
   --ignore=entrypoints/openai/test_sleep.py \
   --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_lora_adapters.py \
-  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-  --ignore=entrypoints/openai/test_root_path.py \
-  --ignore=entrypoints/openai/test_tokenization.py \
   --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi
 
 #ignore certain Entrypoints/llm tests
-if [[ $commands == *" entrypoints/llm "* ]]; then
-  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
-  --ignore=entrypoints/llm/test_chat.py \
-  --ignore=entrypoints/llm/test_accuracy.py \
-  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
-  --ignore=entrypoints/llm/test_prompt_validation.py "}
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 fi
 
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
@@ -5,8 +5,8 @@
 set -ex
 set -o pipefail
 
-# cd 2 levels into the working directory
-cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+# cd into parent directory of this file
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
 
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 

@@ -10,4 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.s390x .
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
@@ -8,19 +8,15 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
+
 # Setup cleanup
-remove_docker_container() {
-  set -e;
-  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
-  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
-}
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
-
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
   --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"

@@ -40,6 +36,8 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
+    pip install -r vllm/requirements/test.txt
+    pip install -r vllm/requirements/cpu.txt
     pytest -v -s tests/kernels/test_cache.py -m cpu_model
     pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/decoder_only/language -m cpu_model
@@ -9,7 +9,6 @@ python3 use_existing_torch.py
 
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
-  --file docker/Dockerfile \
   --target vllm-openai \
   --platform "linux/arm64" \
   -t gh200-test \
@@ -5,7 +5,7 @@
 set -ex
 
 # Try building the docker image
-docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+docker build -t hpu-test-env -f Dockerfile.hpu .
 
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
@@ -3,7 +3,7 @@
 set -euox pipefail
 
 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
     exit 1
 fi
 
@@ -35,7 +35,7 @@ else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 
 # Setup cleanup
 remove_docker_container() {
.buildkite/run-tpu-v1-test.sh (new executable file, 42 lines)
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && export VLLM_USE_V1=1 \
+    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+    && echo TEST_1 \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
+    && echo TEST_2 \
+    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    && echo TEST_3 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    && echo TEST_4 \
+    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && echo TEST_5 \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
+    && echo TEST_6 \
+    && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \
+    && echo TEST_7 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
+
+
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 
 # Try building the docker image
-docker build -t ${image_name} -f docker/Dockerfile.xpu .
+docker build -t ${image_name} -f Dockerfile.xpu .
 
 # Setup cleanup
 remove_docker_container() {
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-# This script build the CPU docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Setup cleanup
-remove_docker_container() {
-  if [[ -n "$container_id" ]]; then
-    podman rm -f "$container_id" || true
-  fi
-  podman system prune -f
-}
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Try building the docker image
-podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
-
-# Run the image
-container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
-
-function cpu_tests() {
-
-  # offline inference
-  podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
-
-  # Run basic model test
-  podman exec -it "$container_id" bash -c "
-    set -e
-    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
-    pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
-}
-
-# All of CPU tests are expected to be finished less than 40 mins.
-
-export container_id
-export -f cpu_tests
-timeout 40m bash -c cpu_tests
-
@@ -1,103 +0,0 @@
-#!/bin/bash
-
-set -xu
-
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it \
-    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest pytest-asyncio tpu-info \
-    && python3 -m pip install lm_eval[api]==0.4.4 \
-    && export VLLM_XLA_CACHE_PATH= \
-    && export VLLM_USE_V1=1 \
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo HARDWARE \
-    && tpu-info \
-    && { \
-        echo TEST_0: Running test_perf.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
-        echo TEST_0_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_1: Running test_compilation.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
-        echo TEST_1_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_2: Running test_basic.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
-        echo TEST_2_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
-        echo TEST_3_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_4: Running test_quantization_accuracy.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
-        echo TEST_4_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_5: Running examples/offline_inference/tpu.py; \
-        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
-        echo TEST_5_EXIT_CODE: \$?; \
-    } & \
-    { \
-        echo TEST_6: Running test_tpu_model_runner.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
-        echo TEST_6_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_7: Running test_sampler.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
-        echo TEST_7_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_8: Running test_topk_topp_sampler.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
-        echo TEST_8_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_9: Running test_multimodal.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
-        echo TEST_9_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_10: Running test_pallas.py; \
-        pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
-        echo TEST_10_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_11: Running test_struct_output_generate.py; \
-        pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
-        echo TEST_11_EXIT_CODE: \$?; \
-    } & \
-    && { \
-        echo TEST_12: Running test_moe_pallas.py; \
-        pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
-        echo TEST_12_EXIT_CODE: \$?; \
-    } & \
-    # Disable the TPU LoRA tests until the feature is activated
-    # && { \
-    #     echo TEST_13: Running test_moe_pallas.py; \
-    #     pytest -s -v /workspace/vllm/tests/tpu/lora/; \
-    #     echo TEST_13_EXIT_CODE: \$?; \
-    # } & \
-    wait \
-    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
-    "
-
-# TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@ -8,7 +8,6 @@
|
|||||||
# Documentation
|
# Documentation
|
||||||
# label(str): the name of the test. emoji allowed.
|
# label(str): the name of the test. emoji allowed.
|
||||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
|
||||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
|
|
||||||
# fast_check_only(bool): run this test on fastcheck pipeline only
|
# fast_check_only(bool): run this test on fastcheck pipeline only
|
||||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
|
||||||
# command(str): the single command to run for tests. incompatible with commands.
|
# command(str): the single command to run for tests. incompatible with commands.
|
||||||
@ -39,7 +38,7 @@ steps:
|
|||||||
- pip install -r ../../requirements/docs.txt
|
- pip install -r ../../requirements/docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
- SPHINXOPTS=\"-W\" make html
|
||||||
# Check API reference (if it fails, you may have missing mock imports)
|
# Check API reference (if it fails, you may have missing mock imports)
|
||||||
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
|
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@@ -71,7 +70,6 @@ steps:
 - label: Basic Correctness Test # 30min
 #mirror_hardwares: [amd]
 fast_check: true
-torch_nightly: true
 source_file_dependencies:
 - vllm/
 - tests/basic_correctness/test_basic_correctness
@@ -106,8 +104,7 @@ steps:
 - label: Entrypoints Test # 40min
 working_dir: "/vllm-workspace/tests"
 fast_check: true
-torch_nightly: true
-#mirror_hardwares: [amd]
+mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
 - tests/entrypoints/llm
@@ -121,7 +118,7 @@ steps:
 - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
 - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
 - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
+- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
 - pytest -v -s entrypoints/test_chat_utils.py
 - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -153,12 +150,11 @@ steps:
 # TODO: create a dedicated test section for multi-GPU example tests
 # when we have multiple distributed example tests
 - pushd ../examples/offline_inference
-- python3 rlhf.py
-- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+- VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
+- VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 - popd

 - label: Metrics, Tracing Test # 10min
-mirror_hardwares: [amd]
 num_gpus: 2
 source_file_dependencies:
 - vllm/
@@ -166,13 +162,18 @@ steps:
 - tests/tracing
 commands:
 - pytest -v -s metrics
+- "pip install \
+'opentelemetry-sdk>=1.26.0,<1.27.0' \
+'opentelemetry-api>=1.26.0,<1.27.0' \
+'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
+'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
 - pytest -v -s tracing

 ##### fast check tests #####
 ##### 1 GPU test #####

 - label: Regression Test # 5min
-#mirror_hardwares: [amd]
+mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
 - tests/test_regression
@@ -203,13 +204,12 @@ steps:
 commands:
 # split the test to avoid interference
 - pytest -v -s v1/core
+- pytest -v -s v1/entrypoints
 - pytest -v -s v1/engine
 - pytest -v -s v1/entrypoints
 - pytest -v -s v1/sample
 - pytest -v -s v1/worker
 - pytest -v -s v1/structured_output
-- pytest -v -s v1/spec_decode
-- pytest -v -s v1/test_serial_utils.py
 - pytest -v -s v1/test_stats.py
 - pytest -v -s v1/test_utils.py
 - pytest -v -s v1/test_oracle.py
@@ -285,25 +285,14 @@ steps:
 - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-#mirror_hardwares: [amd]
+mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/lora
 - tests/lora
-command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
 parallelism: 4

-- label: PyTorch Compilation Unit Tests
-torch_nightly: true
-source_file_dependencies:
-- vllm/
-- tests/compile
-commands:
-- pytest -v -s compile/test_pass_manager.py
-- pytest -v -s compile/test_fusion.py
-- pytest -v -s compile/test_sequence_parallelism.py

 - label: PyTorch Fullgraph Smoke Test # 9min
-torch_nightly: true
 source_file_dependencies:
 - vllm/
 - tests/compile
@@ -312,63 +301,27 @@ steps:
 # these tests need to be separated, cannot combine
 - pytest -v -s compile/piecewise/test_simple.py
 - pytest -v -s compile/piecewise/test_toy_llama.py
+- pytest -v -s compile/test_pass_manager.py

 - label: PyTorch Fullgraph Test # 18min
-torch_nightly: true
 source_file_dependencies:
 - vllm/
 - tests/compile
 commands:
 - pytest -v -s compile/test_full_graph.py

-- label: Kernels Core Operation Test
+- label: Kernels Test %N # 1h each
 mirror_hardwares: [amd]
 source_file_dependencies:
 - csrc/
-- tests/kernels/core
-commands:
-- pytest -v -s kernels/core

-- label: Kernels Attention Test %N
-mirror_hardwares: [amd]
-source_file_dependencies:
-- csrc/attention/
 - vllm/attention
-- vllm/v1/attention
-- tests/kernels/attention
+- tests/kernels
 commands:
-- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-parallelism: 2
+- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+parallelism: 4

-- label: Kernels Quantization Test %N
-mirror_hardwares: [amd]
-source_file_dependencies:
-- csrc/quantization/
-- vllm/model_executor/layers/quantization
-- tests/kernels/quantization
-commands:
-- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-parallelism: 2

-- label: Kernels MoE Test
-#mirror_hardwares: [amd]
-source_file_dependencies:
-- csrc/moe/
-- tests/kernels/moe
-- vllm/model_executor/layers/fused_moe/
-commands:
-- pytest -v -s kernels/moe

-- label: Kernels Mamba Test
-#mirror_hardwares: [amd]
-source_file_dependencies:
-- csrc/mamba/
-- tests/kernels/mamba
-commands:
-- pytest -v -s kernels/mamba

 - label: Tensorizer Test # 11min
-# mirror_hardwares: [amd]
+mirror_hardwares: [amd]
 soft_fail: true
 source_file_dependencies:
 - vllm/model_executor/model_loader
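The `%N` suffix on a label fans a step across Buildkite parallel jobs, and the `--shard-id`/`--num-shards` flags consume the job index and count that Buildkite injects. A minimal sketch of that command outside CI; the fallback values merely simulate job 0 of 2:

```bash
# Buildkite sets these per parallel job; the defaults are for local dry runs.
BUILDKITE_PARALLEL_JOB="${BUILDKITE_PARALLEL_JOB:-0}"
BUILDKITE_PARALLEL_JOB_COUNT="${BUILDKITE_PARALLEL_JOB_COUNT:-2}"

pytest -v -s kernels \
  --shard-id="$BUILDKITE_PARALLEL_JOB" \
  --num-shards="$BUILDKITE_PARALLEL_JOB_COUNT"
```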
@@ -384,22 +337,14 @@ steps:
 source_file_dependencies:
 - benchmarks/
 commands:
-- bash scripts/run-benchmarks.sh
+- bash run-benchmarks.sh

-- label: Benchmarks CLI Test # 10min
+- label: Quantization Test # 33min
-source_file_dependencies:
-- vllm/
-- tests/benchmarks/
-commands:
-- pytest -v -s benchmarks/

-- label: Quantization Test
 source_file_dependencies:
 - csrc/
 - vllm/model_executor/layers/quantization
 - tests/quantization
-commands:
-- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

 - label: LM Eval Small Models # 53min
 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -408,7 +353,7 @@ steps:
 - vllm/model_executor/layers/quantization
 commands:
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+- bash ./run-tests.sh -c configs/models-small.txt -t 1

 - label: OpenAI API correctness
 source_file_dependencies:
@@ -427,101 +372,97 @@ steps:

 - label: OpenAI-Compatible Tool Use # 20 min
 fast_check: false
-#mirror_hardwares: [ amd ]
+mirror_hardwares: [ amd ]
 source_file_dependencies:
 - vllm/
 - tests/tool_use
-- tests/mistral_tool_use
 commands:
 - pytest -v -s tool_use
-- pytest -v -s mistral_tool_use

 ##### models test #####

 - label: Basic Models Test # 24min
-torch_nightly: true
 source_file_dependencies:
 - vllm/
 - tests/models
 commands:
 - pytest -v -s models/test_transformers.py
 - pytest -v -s models/test_registry.py
-- pytest -v -s models/test_utils.py
-- pytest -v -s models/test_vision.py
 # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
-- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
-- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
+- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

-- label: Language Models Test (Standard)
+- label: Language Models Test (Standard) # 32min
 #mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
-- tests/models/language
+- tests/models/decoder_only/language
+- tests/models/embedding/language
+- tests/models/encoder_decoder/language
 commands:
-# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-- pytest -v -s models/language -m core_model
+- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+- pytest -v -s models/embedding/language -m core_model

-- label: Language Models Test (Extended)
+- label: Language Models Test (Extended) # 1h10min
 optional: true
 source_file_dependencies:
 - vllm/
-- tests/models/language
+- tests/models/decoder_only/language
+- tests/models/embedding/language
+- tests/models/encoder_decoder/language
 commands:
-# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-- pytest -v -s models/language -m 'not core_model'
+- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+- pytest -v -s models/embedding/language -m 'not core_model'

-- label: Multi-Modal Models Test (Standard)
+- label: Multi-Modal Models Test (Standard) # 40min
 #mirror_hardwares: [amd]
 source_file_dependencies:
 - vllm/
-- tests/models/multimodal
+- tests/models/decoder_only/audio_language
+- tests/models/decoder_only/vision_language
+- tests/models/embedding/vision_language
+- tests/models/encoder_decoder/audio_language
+- tests/models/encoder_decoder/vision_language
 commands:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s models/multimodal/processing
-- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
-- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+- pytest -v -s models/multimodal
+- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+- pytest -v -s models/embedding/vision_language -m core_model
+- pytest -v -s models/encoder_decoder/audio_language -m core_model
+- pytest -v -s models/encoder_decoder/language -m core_model
+- pytest -v -s models/encoder_decoder/vision_language -m core_model

-- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 48m
 optional: true
 source_file_dependencies:
 - vllm/
-- tests/models/multimodal
+- tests/models/decoder_only/audio_language
+- tests/models/decoder_only/vision_language
+- tests/models/embedding/vision_language
+- tests/models/encoder_decoder/vision_language
 commands:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+# HACK - run phi3v tests separately to sidestep this transformers bug
+# https://github.com/huggingface/transformers/issues/34307
+- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+- pytest -v -s models/embedding/vision_language -m 'not core_model'
+- pytest -v -s models/encoder_decoder/language -m 'not core_model'
+- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

-- label: Multi-Modal Models Test (Extended) 2
+- label: Multi-Modal Models Test (Extended) 2 # 38m
 optional: true
 source_file_dependencies:
 - vllm/
-- tests/models/multimodal
+- tests/models/decoder_only/vision_language
 commands:
 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

-- label: Multi-Modal Models Test (Extended) 3
-optional: true
-source_file_dependencies:
-- vllm/
-- tests/models/multimodal
-commands:
-- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

-- label: Quantized Models Test
-#mirror_hardwares: [amd]
-source_file_dependencies:
-- vllm/model_executor/layers/quantization
-- tests/models/quantization
-commands:
-- pytest -v -s models/quantization

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-mirror_hardwares: [amd]
 optional: true
 commands:
 - echo 'Testing custom models...'
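The extended multi-modal suites shard a single test file with pytest marker expressions such as `split(group=0)`. A hedged illustration of driving both shards in one loop; `split` is vLLM's custom pytest marker, and the loop assumes it is registered in the test suite being run:

```bash
# Each CI job normally runs one marker-defined shard of the same file;
# this loop just runs the shards back to back for illustration.
for group in 0 1; do
  pytest -v -s models/decoder_only/vision_language/test_models.py \
    -m "split(group=${group}) and not core_model and not quant_model"
done
```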
@@ -533,7 +474,6 @@ steps:
 ##### multi gpus test #####

 - label: Distributed Comm Ops Test # 7min
-mirror_hardwares: [amd]
 working_dir: "/vllm-workspace/tests"
 num_gpus: 2
 source_file_dependencies:
@@ -580,23 +520,21 @@ steps:
 - vllm/v1/engine/
 commands:
 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-- pytest -v -s entrypoints/llm/test_collective_rpc.py
+- VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
 - pytest -v -s ./compile/test_basic_correctness.py
 - pytest -v -s ./compile/test_wrapper.py
 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
 # Avoid importing model tests that cause CUDA reinitialization error
 - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-- pytest models/language -v -s -m 'distributed(num_gpus=2)'
-- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
-# test sequence parallel
-- pytest -v -s distributed/test_sequence_parallel.py
+- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
 # this test fails consistently.
 # TODO: investigate and fix
 # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
 - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
 - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
-- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

 - label: Plugin Tests (2 GPUs) # 40min
 working_dir: "/vllm-workspace/tests"
@@ -663,6 +601,8 @@ steps:
 # requires multi-GPU testing for validation.
 - pytest -v -s -x lora/test_chatglm3_tp.py
 - pytest -v -s -x lora/test_llama_tp.py
+- pytest -v -s -x lora/test_minicpmv_tp.py
+- pytest -v -s -x lora/test_transfomers_model.py


 - label: Weight Loading Multiple GPU Test # 33min
@@ -713,4 +653,4 @@ steps:
 - vllm/model_executor/layers/quantization
 commands:
 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+- bash ./run-tests.sh -c configs/models-large.txt -t 4
@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
 # if $normal_wheel matches cu118, do not upload the index.html
 echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
-# if $normal_wheel matches cu126, do not upload the index.html
-echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu121"* ]]; then
+# if $normal_wheel matches cu121, do not upload the index.html
+echo "Skipping index files for cu121 wheels"
 else
-# only upload index.html for cu128 wheels (default wheels)
+# only upload index.html for cu124 wheels (default wheels)
 aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
 aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -66,12 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
 # if $normal_wheel matches cu118, do not upload the index.html
 echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
-# if $normal_wheel matches cu126, do not upload the index.html
-echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu121"* ]]; then
+# if $normal_wheel matches cu121, do not upload the index.html
+echo "Skipping index files for cu121 wheels"
 else
-# only upload index.html for cu128 wheels (default wheels)
+# only upload index.html for cu124 wheels (default wheels)
 aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
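Both branches above key off a glob match against the wheel filename, and only the default CUDA build publishes index files. A condensed sketch of the same decision as a reusable function; the tag values and the example wheel name are placeholders:

```bash
#!/usr/bin/env bash
# Decide whether a wheel's CUDA tag should publish index files.
# Tag values below are illustrative, following the branches above.
should_upload_index() {
  local wheel="$1"
  case "$wheel" in
    *cu118*|*cu126*) return 1 ;;  # non-default CUDA builds: skip index
    *)               return 0 ;;  # default build: publish index
  esac
}

wheel="vllm-0.0.0+cu128-cp38-abi3-manylinux1_x86_64.whl"  # example name
if should_upload_index "$wheel"; then
  echo "would upload index.html for $wheel"
else
  echo "skipping index files for $wheel"
fi
```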
1 .github/CODEOWNERS vendored
@@ -12,7 +12,6 @@
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
 /vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
-/vllm/vllm_flash_attn @LucasWilkinson
 CMakeLists.txt @tlrmchlsmth

 # vLLM V1
2 .github/ISSUE_TEMPLATE/200-installation.yml vendored
@@ -14,7 +14,7 @@ body:
 description: |
 Please run the following and paste the output below.
 ```sh
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
2 .github/ISSUE_TEMPLATE/300-usage.yml vendored
@@ -14,7 +14,7 @@ body:
 description: |
 Please run the following and paste the output below.
 ```sh
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
8 .github/ISSUE_TEMPLATE/400-bug-report.yml vendored
@@ -14,19 +14,19 @@ body:
 description: |
 Please run the following and paste the output below.
 ```sh
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
 It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
 value: |
 <details>
-<summary>The output of <code>python collect_env.py</code></summary>
+<summary>The output of `python collect_env.py`</summary>

 ```text
 Your output of `python collect_env.py` here
 ```

 </details>
 validations:
 required: true
@@ -75,7 +75,7 @@ body:
 ```

 ```
-The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+The error message you got, with the full traceback.
 ```
 validations:
 required: true
2 .github/ISSUE_TEMPLATE/600-new-model.yml vendored
@@ -9,7 +9,7 @@ body:
 value: >
 #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

-#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
+#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 - type: textarea
 attributes:
 label: The model to consider.
@@ -35,7 +35,7 @@ body:
 description: |
 Please run the following and paste the output below.
 ```sh
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 # For security purposes, please feel free to check the contents of collect_env.py before running it.
 python collect_env.py
 ```
2 .github/PULL_REQUEST_TEMPLATE.md vendored
@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)

 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
36 .github/mergify.yml vendored
@@ -19,7 +19,7 @@ pull_request_rules:
 - files~=\.buildkite/
 - files~=^cmake/
 - files=CMakeLists.txt
-- files~=^docker/Dockerfile
+- files~=^Dockerfile
 - files~=^requirements.*\.txt
 - files=setup.py
 actions:
@@ -55,19 +55,11 @@ pull_request_rules:
 description: Automatically apply structured-output label
 conditions:
 - or:
-- files~=^benchmarks/structured_schemas/
-- files=benchmarks/benchmark_serving_structured_output.py
-- files=benchmarks/run_structured_output_benchmark.sh
-- files=docs/source/features/structured_outputs.md
-- files=examples/offline_inference/structured_outputs.py
-- files=examples/online_serving/openai_chat_completion_structured_outputs.py
-- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
 - files~=^vllm/model_executor/guided_decoding/
 - files=tests/model_executor/test_guided_processors.py
 - files=tests/entrypoints/llm/test_guided_generate.py
-- files~=^tests/v1/structured_output/
-- files=tests/v1/entrypoints/llm/test_guided_generate.py
-- files~=^vllm/v1/structured_output/
+- files=benchmarks/benchmark_serving_guided.py
+- files=benchmarks/benchmark_guided.py
 actions:
 label:
 add:
@@ -126,28 +118,6 @@ pull_request_rules:
 remove:
 - tpu

-- name: label-tool-calling
-description: Automatically add tool-calling label
-conditions:
-- or:
-- files~=^tests/tool_use/
-- files~=^tests/mistral_tool_use/
-- files~=^tests/entrypoints/openai/tool_parsers/
-- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
-- files~=^vllm/entrypoints/openai/tool_parsers/
-- files=docs/source/features/tool_calling.md
-- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
-- files=docs/source/getting_started/examples/chat_with_tools.md
-- files~=^examples/tool_chat_*
-- files=examples/offline_inference/chat_with_tools.py
-- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
-- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
-- files=examples/online_serving/openai_chat_completion_client_with_tools.py
-actions:
-label:
-add:
-- tool-calling

 - name: ping author on conflicts and add 'needs-rebase' label
 conditions:
 - conflict
6 .github/workflows/lint-and-deploy.yaml vendored
@@ -50,7 +50,7 @@ jobs:
 uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

 - name: Build the Docker image vllm cpu
-run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

 - name: Configuration of docker images, network and namespace for the kind cluster
 run: |
@@ -66,7 +66,7 @@ jobs:
 export AWS_SECRET_ACCESS_KEY=minioadmin
 sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
 helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

 - name: curl test
 run: |
 kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@@ -79,4 +79,4 @@ jobs:
 "max_tokens": 7,
 "temperature": 0
 }'):$CODE"
 echo "$CODE"
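The curl test above checks only the HTTP status of a single completion request against the forwarded port. A hedged sketch of that probe; the endpoint path is the standard OpenAI-compatible route vLLM serves, and the model name is an assumption inferred from the chart's `extraInit.s3modelpath`:

```bash
# Assumes `kubectl port-forward ... 8001:80` is already running.
# Model name "opt-125m" is an assumption, not confirmed by the workflow.
CODE=$(curl -s -o /dev/null -w '%{http_code}' \
  -X POST http://localhost:8001/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "opt-125m", "prompt": "Hello", "max_tokens": 7, "temperature": 0}')
echo "$CODE"   # expect 200 once the server is up
```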
5 .gitignore vendored
@@ -3,6 +3,7 @@

 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
+!vllm/vllm_flash_attn/fa_utils.py

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -80,7 +81,6 @@ instance/
 # Sphinx documentation
 docs/_build/
 docs/source/getting_started/examples/
-docs/source/api/vllm

 # PyBuilder
 .pybuilder/
@@ -203,6 +203,3 @@ benchmarks/**/*.json
 # Linting
 actionlint
 shellcheck*/

-# Ingore moe/marlin_moe gen code
-csrc/moe/marlin_moe_wna16/kernel_*
@@ -1,6 +1,3 @@
-default_install_hook_types:
-- pre-commit
-- commit-msg
 default_stages:
 - pre-commit # Run locally
 - manual # Run in CI
@@ -11,30 +8,31 @@ repos:
 hooks:
 - id: yapf
 args: [--in-place, --verbose]
+additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.11.7
+rev: v0.9.3
 hooks:
 - id: ruff
 args: [--output-format, github, --fix]
 - repo: https://github.com/codespell-project/codespell
-rev: v2.4.1
+rev: v2.4.0
 hooks:
 - id: codespell
 additional_dependencies: ['tomli']
 args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-rev: 6.0.1
+rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
 hooks:
 - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-rev: v20.1.3
+rev: v19.1.7
 hooks:
 - id: clang-format
 exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
 types_or: [c++, cuda]
 args: [--style=file, --verbose]
 - repo: https://github.com/jackdewinter/pymarkdown
-rev: v0.9.29
+rev: v0.9.27
 hooks:
 - id: pymarkdown
 args: [fix]
@@ -43,10 +41,10 @@ repos:
 hooks:
 - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-rev: 0.6.17
+rev: 0.6.2
 hooks:
 - id: pip-compile
-args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+args: [requirements/test.in, -o, requirements/test.txt]
 files: ^requirements/test\.(in|txt)$
 - repo: local
 hooks:
@@ -101,8 +99,8 @@ repos:
 args:
 - -c
 - |
-if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
-printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
+if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
 fi
 language: system
 verbose: true
@@ -121,10 +119,6 @@ repos:
 language: system
 always_run: true
 pass_filenames: false
-- id: update-dockerfile-graph
-name: Update Dockerfile dependency graph
-entry: tools/update-dockerfile-graph.sh
-language: script
 # Keep `suggestion` last
 - id: suggestion
 name: Suggestion
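Given the stage split above (`pre-commit` hooks run locally, `manual` ones in CI) and the hook types the base side installs, the same checks can be reproduced from a shell. A hedged usage sketch, assuming `pre-commit` itself is installed:

```bash
# Install the hook types the base-side config declares, then run everything once.
pip install pre-commit
pre-commit install --hook-type pre-commit --hook-type commit-msg
pre-commit run --all-files                       # the local stage
pre-commit run --all-files --hook-stage manual   # what CI runs
```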
158 CMakeLists.txt
@@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)

 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")

 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

@@ -33,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent. The ROCm torch
-# versions are derived from docker/Dockerfile.rocm
+# versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -229,28 +230,25 @@ set(VLLM_EXT_SRC
 "csrc/cache_kernels.cu"
 "csrc/attention/paged_attention_v1.cu"
 "csrc/attention/paged_attention_v2.cu"
-"csrc/attention/merge_attn_states.cu"
 "csrc/pos_encoding_kernels.cu"
 "csrc/activation_kernels.cu"
 "csrc/layernorm_kernels.cu"
 "csrc/layernorm_quant_kernels.cu"
-"csrc/cuda_view.cu"
 "csrc/quantization/gptq/q_gemm.cu"
 "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
 "csrc/quantization/fp8/common.cu"
 "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
 "csrc/quantization/gguf/gguf_kernel.cu"
-"csrc/quantization/activation_kernels.cu"
 "csrc/cuda_utils_kernels.cu"
 "csrc/prepare_inputs/advance_step.cu"
-"csrc/custom_all_reduce.cu"
 "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
 SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

-# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
+# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
+# Please keep this in sync with FetchContent_Declare line below.
+set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")

 # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
 if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -268,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 cutlass
 GIT_REPOSITORY https://github.com/nvidia/cutlass.git
 # Please keep this in sync with CUTLASS_REVISION line above.
-GIT_TAG ${CUTLASS_REVISION}
+GIT_TAG v3.8.0
 GIT_PROGRESS TRUE

 # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -284,13 +282,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 "csrc/mamba/causal_conv1d/causal_conv1d.cu"
 "csrc/quantization/aqlm/gemm_kernels.cu"
 "csrc/quantization/awq/gemm_kernels.cu"
+"csrc/custom_all_reduce.cu"
 "csrc/permute_cols.cu"
 "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
 "csrc/quantization/fp4/nvfp4_quant_entry.cu"
 "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
 "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-"csrc/cutlass_extensions/common.cpp"
-"csrc/attention/mla/cutlass_mla_entry.cu")
+"csrc/cutlass_extensions/common.cpp")

 set_gencode_flags_for_srcs(
 SRCS "${VLLM_EXT_SRC}"
@@ -301,52 +299,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # are not supported by Machete yet.
 cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 if (MARLIN_ARCHS)

-#
-# For the Marlin kernels we automatically generate sources for various
-# preselected input type pairs and schedules.
-# Generate sources:
-set(MARLIN_GEN_SCRIPT
-${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
-file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)

-message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
-message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")

-if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
-OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
-execute_process(
-COMMAND ${CMAKE_COMMAND} -E env
-PYTHONPATH=$PYTHONPATH
-${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
-RESULT_VARIABLE marlin_generation_result
-OUTPUT_VARIABLE marlin_generation_result
-OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
-ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
-)

-if (NOT marlin_generation_result EQUAL 0)
-message(FATAL_ERROR "Marlin generation failed."
-" Result: \"${marlin_generation_result}\""
-"\nCheck the log for details: "
-"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
-else()
-set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
-CACHE STRING "Last run Marlin generate script hash" FORCE)
-message(STATUS "Marlin generation completed successfully.")
-endif()
-else()
-message(STATUS "Marlin generation script has not changed, skipping generation.")
-endif()

-file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
-set_gencode_flags_for_srcs(
-SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
-CUDA_ARCHS "${MARLIN_ARCHS}")

-list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

 set(MARLIN_SRCS
+"csrc/quantization/fp8/fp8_marlin.cu"
 "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
 "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
 "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
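The deleted block gates kernel code generation on an MD5 hash of the generator script, cached across configure runs, so regeneration happens only when the script itself changes. The same idea reduced to a shell sketch; all paths are placeholders:

```bash
#!/usr/bin/env bash
# Re-run a code generator only when the generator script changed.
# Paths below are illustrative placeholders.
GEN_SCRIPT="generate_kernels.py"
STAMP=".gen_script.md5"

new_hash=$(md5sum "$GEN_SCRIPT" | awk '{print $1}')
old_hash=$(cat "$STAMP" 2>/dev/null || true)

if [[ "$new_hash" != "$old_hash" ]]; then
  python "$GEN_SCRIPT" > generation.log 2>&1 \
    || { echo "generation failed; see generation.log" >&2; exit 1; }
  echo "$new_hash" > "$STAMP"   # cache the hash for the next run
  echo "generation completed"
else
  echo "generator unchanged, skipping generation"
fi
```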
@ -418,7 +372,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
|
||||||
)
|
)
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
@ -508,26 +461,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set(FP4_ARCHS)
|
set(FP4_ARCHS)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# CUTLASS MLA Archs and flags
|
#
|
||||||
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
|
|
||||||
set(SRCS
|
|
||||||
"csrc/attention/mla/cutlass_mla_kernels.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
|
||||||
SRCS "${SRCS}"
|
|
||||||
CUDA_ARCHS "${MLA_ARCHS}")
|
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
|
|
||||||
# Add MLA-specific include directories only to MLA source files
|
|
||||||
set_source_files_properties(${SRCS}
|
|
||||||
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
|
|
||||||
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
|
|
||||||
else()
|
|
||||||
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
|
|
||||||
# clear MLA_ARCHS
|
|
||||||
set(MLA_ARCHS)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# CUTLASS MoE kernels
|
# CUTLASS MoE kernels
|
||||||
|
|
||||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
||||||
@ -673,51 +607,21 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
||||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
||||||
if (MARLIN_MOE_ARCHS)
|
if (MARLIN_MOE_ARCHS)
|
||||||
|
set(MARLIN_MOE_SRC
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
|
||||||
|
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
|
||||||
|
"csrc/moe/marlin_moe_ops.cu")
|
||||||
|
|
||||||
#
|
|
||||||
# For the Marlin MOE kernels we automatically generate sources for various
|
|
||||||
# preselected input type pairs and schedules.
|
|
||||||
# Generate sources:
|
|
||||||
set(MOE_MARLIN_GEN_SCRIPT
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
|
|
||||||
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
|
|
||||||
|
|
||||||
message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
|
|
||||||
|
|
||||||
if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
|
|
||||||
execute_process(
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E env
|
|
||||||
PYTHONPATH=$PYTHONPATH
|
|
||||||
${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
|
|
||||||
RESULT_VARIABLE moe_marlin_generation_result
|
|
||||||
OUTPUT_VARIABLE moe_marlin_generation_output
|
|
||||||
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
|
||||||
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
|
|
||||||
)
|
|
||||||
|
|
||||||
if (NOT moe_marlin_generation_result EQUAL 0)
|
|
||||||
message(FATAL_ERROR "Marlin MOE generation failed."
|
|
||||||
" Result: \"${moe_marlin_generation_result}\""
|
|
||||||
"\nCheck the log for details: "
|
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
|
|
||||||
else()
|
|
||||||
set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
|
|
||||||
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
|
|
||||||
message(STATUS "Marlin MOE generation completed successfully.")
|
|
||||||
endif()
|
|
||||||
else()
|
|
||||||
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${MOE_WNAA16_MARLIN_SRC}"
|
SRCS "${MARLIN_MOE_SRC}"
|
||||||
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
|
||||||
|
|
||||||
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
|
list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
|
||||||
|
|
||||||
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
|
||||||
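Side note on the generation logic removed above: the base side gates Marlin MOE kernel generation on an MD5 of the generator script, cached between configure runs, so `generate_kernels.py` is only re-executed when its contents change. A rough stand-alone shell sketch of the same pattern (the stamp-file name is illustrative, not part of the build):

```bash
#!/usr/bin/env bash
# Re-run a generator only when the generator itself changed (illustrative sketch).
GEN_SCRIPT="csrc/moe/marlin_moe_wna16/generate_kernels.py"  # path taken from the diff
STAMP=".moe_marlin_gen.md5"                                 # assumed stamp file

new_hash=$(md5sum "${GEN_SCRIPT}" | cut -d' ' -f1)
old_hash=$(cat "${STAMP}" 2>/dev/null || true)

if [ "${new_hash}" != "${old_hash}" ]; then
    # Regenerate, logging output the way the CMake block logs to moe_marlin_generation.log.
    python3 "${GEN_SCRIPT}" > moe_marlin_generation.log 2>&1 \
        || { echo "Marlin MOE generation failed; see moe_marlin_generation.log"; exit 1; }
    echo "${new_hash}" > "${STAMP}"
else
    echo "Generation script has not changed, skipping generation."
fi
```

Keying the skip on the script's content rather than on timestamps keeps incremental builds correct even after a fresh checkout.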
@@ -725,17 +629,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 endif()

-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  set(MOE_PERMUTE_SRC
-      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
-      "csrc/moe/moe_permute_unpermute_op.cu")
-
-  set_gencode_flags_for_srcs(
-    SRCS "${MARLIN_PERMUTE_SRC}"
-    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
-
-  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
-endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
   _moe_C
@@ -744,8 +637,6 @@ define_gpu_extension_target(
   SOURCES ${VLLM_MOE_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
-  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)

@@ -755,7 +646,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   set(VLLM_ROCM_EXT_SRC
     "csrc/rocm/torch_bindings.cpp"
-    "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")

 define_gpu_extension_target(
@@ -5,11 +5,11 @@
 # docs/source/contributing/dockerfile/dockerfile.md and
 # docs/source/assets/contributing/dockerfile-stages-dependency.png

-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive
@@ -19,10 +19,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
-   && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+   && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
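The four lines dropped on the branch side wrap `add-apt-repository` in a retry because the deadsnakes PPA endpoint is occasionally flaky in CI. The same hardening can be factored into a helper; this sketch is illustrative and not part of either Dockerfile:

```bash
# Generic retry helper in the spirit of the loop above (names are illustrative).
retry() {
    local attempts=$1; shift
    local i
    for i in $(seq 1 "${attempts}"); do
        "$@" && return 0                     # succeed as soon as the command does
        echo "Attempt ${i} failed, retrying in 5s..." >&2
        sleep 5
    done
    return 1                                 # give up after the last attempt
}

retry 3 add-apt-repository -y ppa:deadsnakes/ppa
```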
@@ -37,7 +34,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"

 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
@@ -70,8 +66,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/cuda.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install --system -r requirements/cuda.txt

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
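The `--extra-index-url` arguments this branch removes derive the PyTorch wheel-index tag from `CUDA_VERSION` with a small pipeline; for example:

```bash
# "12.4.1" -> keep major.minor -> drop the dot -> "cu124"
CUDA_VERSION=12.4.1
echo "cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')"
# prints: cu124  (so the index URL becomes https://download.pytorch.org/whl/cu124)
```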
@@ -94,11 +89,9 @@ COPY requirements/build.txt requirements/build.txt
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install --system -r requirements/build.txt

 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -165,25 +158,19 @@ FROM base as dev
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-
-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
-
 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install --system -r requirements/dev.txt
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
@@ -198,10 +185,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-   && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+   && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@@ -216,7 +200,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -237,8 +220,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install --system dist/*.whl --verbose
@@ -255,17 +237,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-        # TESTING: install FlashInfer from source to test 2.7.0 final RC
-        FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
-            uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
+        uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
     fi
 COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    . /etc/environment && \
-    uv pip list
-
 # Although we build Flashinfer with AOT mode, there's still
 # some issues w.r.t. JIT compilation. Therefore we need to
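For reference, the removed lines assemble an ahead-of-release FlashInfer build (AOT compilation for a fixed set of CUDA archs). Pieced together as a plain shell session, using only the values that appear in the diff:

```bash
# Build FlashInfer from source instead of installing a released wheel
# (tag and arch list are the ones named in the removed lines above).
export FLASHINFER_ENABLE_AOT=1
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
uv pip install --system --no-build-isolation \
    "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1"
```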
@@ -273,8 +247,7 @@ uv pip list
 # TODO: Remove this once FlashInfer AOT wheel is fixed
 COPY requirements/build.txt requirements/build.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    uv pip install --system -r requirements/build.txt

 #################### vLLM installation IMAGE ####################

@@ -288,11 +261,6 @@ ADD . /vllm-workspace/
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"

-# Workaround for #17068
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
-
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -321,7 +289,6 @@ RUN mv vllm test_docs/
 #################### OPENAI API SERVER ####################
 # base openai image with additional requirements, for any subsequent openai-style images
 FROM vllm-base AS vllm-openai-base
-ARG TARGETPLATFORM

 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
Dockerfile.cpu (new file, 69 lines)
@@ -0,0 +1,69 @@
+# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-1
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install intel-openmp==2025.0.1
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+RUN pip install intel_extension_for_pytorch==2.6.0
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    pip install --upgrade pip && \
+    pip install -r requirements/build.txt
+
+FROM cpu-test-1 AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
+    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
+    pip install -v -r requirements/cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -e tests/vllm_test_utils
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
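The CPU image above relies on `LD_PRELOAD` to route allocations through tcmalloc and OpenMP through intel-openmp. A quick, illustrative way to confirm that both preloads resolve inside the image (not part of the Dockerfile):

```bash
# If either library is missing, the dynamic loader prints an
# "ERROR: ld.so: object ... cannot be preloaded" warning before the output.
LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" \
    python3 -c "print('preload ok')"
```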
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

 COPY ./ /workspace/vllm

@@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"

 FROM $BASE_IMAGE

@@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm

 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
-RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
+RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install pytest

 # uninstall transformers-neuronx package explicitly to avoid version conflict
@@ -38,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \
     && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
     && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
     && python -m pip install -U pip uv \
-    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
+    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
     && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
     && cd /tmp && touch control
@@ -126,16 +126,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 FROM base-builder AS cv-builder

 ARG MAX_JOBS
-ARG OPENCV_VERSION=86
-# patch for version 4.11.0.86
-ARG OPENCV_PATCH=97f3f39
+ARG OPENCV_VERSION=84
 ARG ENABLE_HEADLESS=1
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable && \
     git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
     cd opencv-python && \
-    sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
-    cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
+    sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
     python -m build --wheel --installer=uv --outdir /opencvwheels/

 ###############################################################
@@ -151,15 +148,9 @@ COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null

 ARG VLLM_TARGET_DEVICE=cpu
-ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1

 # this step installs vllm and populates uv cache
 # with all the transitive dependencies
-RUN --mount=type=cache,target=/root/.cache/uv \
-    source /opt/rh/gcc-toolset-13/enable && \
-    git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
-    uv pip install maturin && \
-    uv build --wheel --out-dir /hf_wheels/
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
@@ -168,7 +159,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable && \
     uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
     sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
-    uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
+    uv pip install pandas pythran pybind11 && \
     # sentencepiece.pc is in some pkgconfig inside uv cache
     export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
     uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
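The `PKG_CONFIG_PATH` line above works around `sentencepiece.pc` living somewhere inside the uv cache by sweeping every `pkgconfig` directory on the filesystem into the search path. An illustrative way to verify the trick worked:

```bash
# Collect every pkgconfig directory (including ones inside the uv cache) ...
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':')
# ... then confirm pkg-config can now resolve sentencepiece.
pkg-config --exists sentencepiece && echo "sentencepiece.pc found"
```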
@@ -247,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     && python -m pip install -U pip uv --no-cache \
     && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
     && make -C /numactl install \
-    && uv pip install 'cmake<4' \
+    && uv pip install cmake \
     && cmake --install /lapack/build \
     && uv pip uninstall cmake

@@ -256,9 +247,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
-    --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
     --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
-    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
+    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl

 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
@@ -114,16 +114,8 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

-# ENV that can improve safe tensor loading, and end-to-end time
-ENV SAFETENSORS_FAST_GPU=1
-
-# User-friendly environment setting for multi-processing to avoid below RuntimeError.
-# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
-# you must use the 'spawn' start method
-# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1

 CMD ["/bin/bash"]

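The defaults removed above enable fast safetensors loading and force the `spawn` start method, which avoids PyTorch's "Cannot re-initialize CUDA in forked subprocess" error. With them gone from the image, they can still be supplied at container start; an illustrative invocation (the image name is a placeholder, and the device flags are the usual ROCm ones rather than anything taken from the diff):

```bash
docker run -it \
    --device=/dev/kfd --device=/dev/dri \
    -e SAFETENSORS_FAST_GPU=1 \
    -e VLLM_WORKER_MULTIPROC_METHOD=spawn \
    <rocm-vllm-image> /bin/bash
```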
@@ -1,18 +1,18 @@
 ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
+ARG HIPBLASLT_BRANCH="4d40e36"
 ARG HIPBLAS_COMMON_BRANCH="7c1566b"
 ARG LEGACY_HIPBLASLT_OPTION=
 ARG RCCL_BRANCH="648a58d"
 ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
-ARG PYTORCH_VISION_BRANCH="v0.21.0"
+ARG PYTORCH_BRANCH="3a585126"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
 ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="1a7f4dfa"
-ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="5a77249"
+ARG FA_BRANCH="b7d29fb"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
+ARG AITER_BRANCH="21d47a9"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
@@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
 ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

 ARG PYTHON_VERSION=3.12
@@ -31,11 +31,8 @@ ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+    && apt-get install -y software-properties-common git curl sudo vim less \
+    && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
        python${PYTHON_VERSION}-lib2to3 python-is-python3 \
@@ -45,7 +42,7 @@ RUN apt-get update -y \
     && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
     && python3 --version && python3 -m pip --version

-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython

 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@@ -63,8 +60,7 @@ RUN cd hipBLAS-common \
 RUN git clone https://github.com/ROCm/hipBLASLt
 RUN cd hipBLASLt \
     && git checkout ${HIPBLASLT_BRANCH} \
-    && apt-get install -y llvm-dev \
-    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
     && cd build/release \
     && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@@ -114,24 +110,11 @@ RUN git clone ${FA_REPO}
 RUN cd flash-attention \
     && git checkout ${FA_BRANCH} \
     && git submodule update --init \
-    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
     && cp /app/vision/dist/*.whl /app/install \
     && cp /app/flash-attention/dist/*.whl /app/install

-FROM base AS build_aiter
-ARG AITER_BRANCH
-ARG AITER_REPO
-RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
-    pip install /install/*.whl
-RUN git clone --recursive ${AITER_REPO}
-RUN cd aiter \
-    && git checkout ${AITER_BRANCH} \
-    && git submodule update --init --recursive \
-    && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
-RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
-
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
     dpkg -i /install/*deb \
@@ -147,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
     pip install /install/*.whl
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
     pip install /install/*.whl
-RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
-    pip install /install/*.whl
+ARG AITER_REPO
+ARG AITER_BRANCH
+RUN git clone --recursive ${AITER_REPO}
+RUN cd aiter \
+    && git checkout ${AITER_BRANCH} \
+    && git submodule update --init --recursive \
+    && pip install -r requirements.txt \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter

 ARG BASE_IMAGE
-ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
 ARG RCCL_REPO
@@ -164,8 +154,6 @@ ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG FA_BRANCH
 ARG FA_REPO
-ARG AITER_BRANCH
-ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
     && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@@ -179,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
     && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
     && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
     && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
     && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
@@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
 RUN microdnf install -y \
     which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
     libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
-    openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \
+    openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
     microdnf clean all

 # Python Installation
@@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd ../../python && \
     export PYARROW_PARALLEL=4 && \
     export ARROW_BUILD_TYPE=release && \
-    uv pip install -r requirements-build.txt && \
+    uv pip install -r requirements/build.txt && \
     python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel

 FROM python-install AS numa-build
@@ -96,22 +96,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
     python setup.py bdist_wheel

-FROM python-install AS hf-xet-builder
-# Install hf-xet
-WORKDIR /tmp
-ENV CARGO_HOME=/root/.cargo
-ENV RUSTUP_HOME=/root/.rustup
-ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
-    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
-    git clone https://github.com/huggingface/xet-core.git && \
-    cd xet-core/hf_xet/ && \
-    uv pip install maturin patchelf && \
-    python -m maturin build --release --out dist && \
-    mkdir -p /tmp/hf-xet/dist && \
-    cp dist/*.whl /tmp/hf-xet/dist/
-
 # Final build stage
 FROM python-install AS vllm-cpu
 ARG PYTHON_VERSION
@@ -123,7 +107,6 @@ ENV UV_LINK_MODE=copy
 ENV CARGO_HOME=/root/.cargo
 ENV RUSTUP_HOME=/root/.rustup
 ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
-ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1

 COPY . /workspace/vllm
 WORKDIR /workspace/vllm
|
|||||||
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
||||||
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
|
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
|
||||||
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
|
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
|
||||||
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
|
|
||||||
sed -i '/^torch/d' requirements/build.txt && \
|
sed -i '/^torch/d' requirements/build.txt && \
|
||||||
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
|
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
|
||||||
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
|
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
|
||||||
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
|
|
||||||
uv pip install -v \
|
uv pip install -v \
|
||||||
$ARROW_WHL_FILE \
|
$ARROW_WHL_FILE \
|
||||||
$VISION_WHL_FILE \
|
$VISION_WHL_FILE \
|
||||||
$HF_XET_WHL_FILE \
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
|
||||||
--index-strategy unsafe-best-match \
|
--index-strategy unsafe-best-match \
|
||||||
-r requirements/build.txt \
|
-r requirements/build.txt \
|
||||||
@@ -169,5 +149,4 @@ USER 2000
 WORKDIR /home/vllm
-
 # Set the default entrypoint
 ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

@@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 -m pip install \
         -r requirements/tpu.txt
-RUN python3 -m pip install -e .
+RUN python3 setup.py develop

 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
@@ -40,6 +40,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     python3 setup.py install

+# Per the XPU doc, we need to manually install intel-extension-for-pytorch 2.6.10+xpu because of conflicting dependencies with torch 2.6.0+xpu.
+# FIXME: This will be fixed in ipex 2.7; just leaving this here for awareness.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install intel-extension-for-pytorch==2.6.10+xpu \
+    --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+
 CMD ["/bin/bash"]

 FROM vllm-base AS vllm-openai
README.md
@@ -10,26 +10,29 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>

 ---

+[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
+
+[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
+
+---
+
 *Latest News* 🔥
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
-- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
-- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
+- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
+- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
+- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
+- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!

 <details>
 <summary>Previous News</summary>

-- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
-- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
-- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
-- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@@ -100,7 +103,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 ## Contributing

 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

 ## Sponsors

@@ -123,7 +126,6 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
-- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
@ -41,39 +41,29 @@ become available.

      <td><code>synthetic</code></td>
  </tr>
  <tr>
-     <td><strong>HuggingFace-VisionArena</strong></td>
+     <td><strong>HuggingFace</strong></td>
-     <td style="text-align: center;">✅</td>
+     <td style="text-align: center;">🟡</td>
-     <td style="text-align: center;">✅</td>
+     <td style="text-align: center;">🟡</td>
-     <td><code>lmarena-ai/VisionArena-Chat</code></td>
+     <td>Specify your dataset path on HuggingFace</td>
  </tr>
  <tr>
-     <td><strong>HuggingFace-InstructCoder</strong></td>
+     <td><strong>VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
-     <td><code>likaixin/InstructCoder</code></td>
+     <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
-  </tr>
-  <tr>
-     <td><strong>HuggingFace-AIMO</strong></td>
-     <td style="text-align: center;">✅</td>
-     <td style="text-align: center;">✅</td>
-     <td><code>AI-MO/aimo-validation-aime</code>, <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
-  </tr>
-  <tr>
-     <td><strong>HuggingFace-Other</strong></td>
-     <td style="text-align: center;">✅</td>
-     <td style="text-align: center;">✅</td>
-     <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
-  </tr>
  </tbody>
</table>

✅: supported

- 🟡: Partial support

🚧: to be supported

- **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+ 🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+ similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
+ If you need support for other dataset formats, please consider contributing.
+
+ **Note**: VisionArena's `dataset-name` should be set to `hf`
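As a concrete reading of these notes, a hypothetical invocation (not part of the diff): `--dataset-name` stays `hf`, and the HuggingFace repo id goes in `--dataset-path`:

```bash
# Hypothetical example: --dataset-name picks the hf loader; the HuggingFace
# repo id itself is passed through --dataset-path.
python3 vllm/benchmarks/benchmark_serving.py \
    --backend openai-chat \
    --model Qwen/Qwen2-VL-7B-Instruct \
    --endpoint /v1/chat/completions \
    --dataset-name hf \
    --dataset-path lmarena-ai/vision-arena-bench-v0.1
```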
---

## Example - Online Benchmark

@ -81,7 +71,8 @@ become available.

First start serving your model

```bash
- vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+ MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+ vllm serve ${MODEL_NAME} --disable-log-requests
```
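Before benchmarking, you may want to confirm the server is up; a quick check that is not part of the diff, assuming the default OpenAI-compatible port 8000:

```bash
# Optional sanity check: the server is ready once the OpenAI-compatible
# endpoint answers; assumes the default port 8000.
curl http://localhost:8000/v1/models
```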
Then run the benchmarking script

@ -89,13 +80,12 @@ Then run the benchmarking script

```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
- python3 vllm/benchmarks/benchmark_serving.py \
-     --backend vllm \
-     --model NousResearch/Hermes-3-Llama-3.1-8B \
-     --endpoint /v1/completions \
-     --dataset-name sharegpt \
-     --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-     --num-prompts 10
+ MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+ NUM_PROMPTS=10
+ BACKEND="vllm"
+ DATASET_NAME="sharegpt"
+ DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+ python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```

If successful, you will see the following output
@ -132,105 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests

```

```bash
+ MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+ NUM_PROMPTS=10
+ BACKEND="openai-chat"
+ DATASET_NAME="hf"
+ DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+ DATASET_SPLIT='train'

python3 vllm/benchmarks/benchmark_serving.py \
-     --backend openai-chat \
-     --model Qwen/Qwen2-VL-7B-Instruct \
-     --endpoint /v1/chat/completions \
-     --dataset-name hf \
-     --dataset-path lmarena-ai/VisionArena-Chat \
-     --hf-split train \
-     --num-prompts 1000
+     --backend "${BACKEND}" \
+     --model "${MODEL_NAME}" \
+     --endpoint "/v1/chat/completions" \
+     --dataset-name "${DATASET_NAME}" \
+     --dataset-path "${DATASET_PATH}" \
+     --hf-split "${DATASET_SPLIT}" \
+     --num-prompts "${NUM_PROMPTS}"
```
- ### InstructCoder Benchmark with Speculative Decoding
+ ### HuggingFaceDataset Examples

+ Currently, HuggingFaceDataset only supports dataset formats
+ similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
+ formats, please consider contributing.

- ```bash
- VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-     --speculative-model "[ngram]" \
-     --ngram_prompt_lookup_min 2 \
-     --ngram-prompt-lookup-max 5 \
-     --num_speculative_tokens 5
- ```
-
- ```bash
- python3 benchmarks/benchmark_serving.py \
-     --model meta-llama/Meta-Llama-3-8B-Instruct \
-     --dataset-name hf \
-     --dataset-path likaixin/InstructCoder \
-     --num-prompts 2048
- ```
-
- ### Other HuggingFaceDataset Examples

```bash
+ # need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```

**`lmms-lab/LLaVA-OneVision-Data`**

```bash
+ MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+ NUM_PROMPTS=10
+ BACKEND="openai-chat"
+ DATASET_NAME="hf"
+ DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
+ DATASET_SPLIT='train'
+ DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \
-     --backend openai-chat \
-     --model Qwen/Qwen2-VL-7B-Instruct \
-     --endpoint /v1/chat/completions \
-     --dataset-name hf \
-     --dataset-path lmms-lab/LLaVA-OneVision-Data \
-     --hf-split train \
-     --hf-subset "chart2text(cauldron)" \
-     --num-prompts 10
+     --backend "${BACKEND}" \
+     --model "${MODEL_NAME}" \
+     --endpoint "/v1/chat/completions" \
+     --dataset-name "${DATASET_NAME}" \
+     --dataset-path "${DATASET_PATH}" \
+     --hf-split "${DATASET_SPLIT}" \
+     --num-prompts "${NUM_PROMPTS}" \
+     --hf-subset "${DATASET_SUBSET}"
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**

```bash
+ MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+ NUM_PROMPTS=10
+ BACKEND="openai-chat"
+ DATASET_NAME="hf"
+ DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
+ DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
-     --backend openai-chat \
-     --model Qwen/Qwen2-VL-7B-Instruct \
-     --endpoint /v1/chat/completions \
-     --dataset-name hf \
-     --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-     --hf-split train \
-     --num-prompts 10
- ```
-
- **`AI-MO/aimo-validation-aime`**
-
- ```bash
- python3 vllm/benchmarks/benchmark_serving.py \
-     --model Qwen/QwQ-32B \
-     --dataset-name hf \
-     --dataset-path AI-MO/aimo-validation-aime \
-     --num-prompts 10 \
-     --seed 42
- ```
-
- ### Running With Sampling Parameters
-
- When using OpenAI-compatible backends such as `vllm`, optional sampling
- parameters can be specified. Example client command:
-
- ```bash
- python3 vllm/benchmarks/benchmark_serving.py \
-     --backend vllm \
-     --model NousResearch/Hermes-3-Llama-3.1-8B \
-     --endpoint /v1/completions \
-     --dataset-name sharegpt \
-     --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-     --top-k 10 \
-     --top-p 0.9 \
-     --temperature 0.5 \
-     --num-prompts 10
+     --backend "${BACKEND}" \
+     --model "${MODEL_NAME}" \
+     --endpoint "/v1/chat/completions" \
+     --dataset-name "${DATASET_NAME}" \
+     --dataset-path "${DATASET_PATH}" \
+     --hf-split "${DATASET_SPLIT}" \
+     --num-prompts "${NUM_PROMPTS}"
```

---

## Example - Offline Throughput Benchmark

```bash
+ MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+ NUM_PROMPTS=10
+ DATASET_NAME="sonnet"
+ DATASET_PATH="vllm/benchmarks/sonnet.txt"

python3 vllm/benchmarks/benchmark_throughput.py \
-     --model NousResearch/Hermes-3-Llama-3.1-8B \
-     --dataset-name sonnet \
-     --dataset-path vllm/benchmarks/sonnet.txt \
-     --num-prompts 10
+     --model "${MODEL_NAME}" \
+     --dataset-name "${DATASET_NAME}" \
+     --dataset-path "${DATASET_PATH}" \
+     --num-prompts "${NUM_PROMPTS}"
```

If successful, you will see the following output
@ -244,13 +217,19 @@ Total num output tokens: 1500

### VisionArena Benchmark for Vision Language Models

```bash
+ MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+ NUM_PROMPTS=10
+ DATASET_NAME="hf"
+ DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+ DATASET_SPLIT="train"

python3 vllm/benchmarks/benchmark_throughput.py \
-     --model Qwen/Qwen2-VL-7B-Instruct \
-     --backend vllm-chat \
-     --dataset-name hf \
-     --dataset-path lmarena-ai/VisionArena-Chat \
-     --num-prompts 1000 \
-     --hf-split train
+     --model "${MODEL_NAME}" \
+     --backend "vllm-chat" \
+     --dataset-name "${DATASET_NAME}" \
+     --dataset-path "${DATASET_PATH}" \
+     --num-prompts "${NUM_PROMPTS}" \
+     --hf-split "${DATASET_SPLIT}"
```

The `num prompt tokens` now includes image token counts

@ -261,83 +240,29 @@ Total num prompt tokens: 14527

Total num output tokens: 1280
```
- ### InstructCoder Benchmark with Speculative Decoding
-
- ```bash
- VLLM_WORKER_MULTIPROC_METHOD=spawn \
- VLLM_USE_V1=1 \
- python3 vllm/benchmarks/benchmark_throughput.py \
-     --dataset-name=hf \
-     --dataset-path=likaixin/InstructCoder \
-     --model=meta-llama/Meta-Llama-3-8B-Instruct \
-     --input-len=1000 \
-     --output-len=100 \
-     --num-prompts=2048 \
-     --async-engine \
-     --speculative-model="[ngram]" \
-     --ngram_prompt_lookup_min=2 \
-     --ngram-prompt-lookup-max=5 \
-     --num_speculative_tokens=5
- ```
-
- ```
- Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
- Total num prompt tokens: 261136
- Total num output tokens: 204800
- ```
-
- ### Other HuggingFaceDataset Examples
-
- **`lmms-lab/LLaVA-OneVision-Data`**
-
- ```bash
- python3 vllm/benchmarks/benchmark_throughput.py \
-     --model Qwen/Qwen2-VL-7B-Instruct \
-     --backend vllm-chat \
-     --dataset-name hf \
-     --dataset-path lmms-lab/LLaVA-OneVision-Data \
-     --hf-split train \
-     --hf-subset "chart2text(cauldron)" \
-     --num-prompts 10
- ```
-
- **`Aeala/ShareGPT_Vicuna_unfiltered`**
-
- ```bash
- python3 vllm/benchmarks/benchmark_throughput.py \
-     --model Qwen/Qwen2-VL-7B-Instruct \
-     --backend vllm-chat \
-     --dataset-name hf \
-     --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-     --hf-split train \
-     --num-prompts 10
- ```
-
- **`AI-MO/aimo-validation-aime`**
-
- ```bash
- python3 benchmarks/benchmark_throughput.py \
-     --model Qwen/QwQ-32B \
-     --backend vllm \
-     --dataset-name hf \
-     --dataset-path AI-MO/aimo-validation-aime \
-     --hf-split train \
-     --num-prompts 10
- ```

### Benchmark with LoRA Adapters

```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+ MODEL_NAME="meta-llama/Llama-2-7b-hf"
+ BACKEND="vllm"
+ DATASET_NAME="sharegpt"
+ DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+ NUM_PROMPTS=10
+ MAX_LORAS=2
+ MAX_LORA_RANK=8
+ ENABLE_LORA="--enable-lora"
+ LORA_PATH="yard1/llama-2-7b-sql-lora-test"

python3 vllm/benchmarks/benchmark_throughput.py \
-     --model meta-llama/Llama-2-7b-hf \
-     --backend vllm \
-     --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-     --dataset_name sharegpt \
-     --num-prompts 10 \
-     --max-loras 2 \
-     --max-lora-rank 8 \
-     --enable-lora \
-     --lora-path yard1/llama-2-7b-sql-lora-test
+     --model "${MODEL_NAME}" \
+     --backend "${BACKEND}" \
+     --dataset_path "${DATASET_PATH}" \
+     --dataset_name "${DATASET_NAME}" \
+     --num-prompts "${NUM_PROMPTS}" \
+     --max-loras "${MAX_LORAS}" \
+     --max-lora-rank "${MAX_LORA_RANK}" \
+     ${ENABLE_LORA} \
+     --lora-path "${LORA_PATH}"
```

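One detail of the variable-driven form worth noting: `${ENABLE_LORA}` is expanded unquoted, so an empty value drops the flag entirely. A hypothetical illustration (not from the diff):

```bash
# Hypothetical toggle: an unquoted empty expansion removes the flag.
ENABLE_LORA="--enable-lora"
echo python3 vllm/benchmarks/benchmark_throughput.py ${ENABLE_LORA}   # flag present
ENABLE_LORA=""
echo python3 vllm/benchmarks/benchmark_throughput.py ${ENABLE_LORA}   # flag omitted
```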
@ -1,212 +0,0 @@

#!/bin/bash

# This script aims to tune the best server parameter combinations to maximize throughput for a given requirement.
# The current server parameter combination is max_num_seqs and max_num_batched_tokens.
# It also supports additional requirements: e2e latency and prefix cache hit rate.

# Pre-requisites:
# 1. Check out your branch and install/update the correct running environment. For TPU, activate the conda env and install the corresponding torch and xla versions.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set variables (ALL REQUIRED)
#   BASE: your directory for the vllm repo
#   MODEL: the model served by vllm
#   DOWNLOAD_DIR: directory to download and load model weights
#   INPUT_LEN: request input len
#   OUTPUT_LEN: request output len
#   MIN_CACHE_HIT_PCT: prefix cache rate
#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# 4. Run the script. It might take a long time; you can use tmux so the script keeps running if you get disconnected.
# 5. The final result will be saved in the RESULT file.

# Example use cases
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get the highest throughput?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we have a latency requirement of less than 500ms, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach a 60% prefix cache hit rate, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500

TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"

echo "result file: $RESULT"
echo "model: $MODEL"
echo

rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER

cd "$BASE/vllm"
# create sonnet_4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
    cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

pip install datasets

current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0

run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log

    # start the server
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization 0.98 \
        --max-num-seqs $max_num_seqs \
        --max-num-batched-tokens $max_num_batched_tokens \
        --tensor-parallel-size 1 \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir $DOWNLOAD_DIR \
        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
    echo "wait for 10 minutes.."
    echo
    # wait for up to 10 minutes for the server to start
    server_started=0
    for i in {1..60}; do
        if grep -Fq "Application startup complete" "$vllm_log"; then
            echo "Application started"
            server_started=1
            break
        else
            # echo "wait for 10 seconds..."
            sleep 10
        fi
    done

    if (( ! server_started )); then
        echo "server did not start within 10 minutes, terminate the benchmarking. Please check the server log at $vllm_log"
        echo "pkill -f vllm"
        echo
        pkill vllm
        sleep 10
        return 1
    fi

    echo "run benchmark test..."
    echo
    meet_latency_requirement=0
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL \
        --dataset-name sonnet \
        --dataset-path benchmarks/sonnet_4x.txt \
        --sonnet-input-len $INPUT_LEN \
        --sonnet-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 100 \
        --sonnet-prefix-len $prefix_len \
        --port 8004 > "$bm_log"
    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
    fi

    if (( ! meet_latency_requirement )); then
        # start from request-rate as int(through_put) + 1
        request_rate=$((${through_put%.*} + 1))
        while ((request_rate > 0)); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL \
                --dataset-name sonnet \
                --dataset-path benchmarks/sonnet_4x.txt \
                --sonnet-input-len $INPUT_LEN \
                --sonnet-output-len $OUTPUT_LEN \
                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --sonnet-prefix-len $prefix_len \
                --port 8004 > "$bm_log"
            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$((request_rate-1))
        done
    fi
    # write the results and update the best result.
    if ((meet_latency_requirement)); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $through_put, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $through_put, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
            best_throughput=$through_put
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    echo "pkill -f vllm"
    echo
    pkill vllm
    sleep 10
    rm -f $vllm_log
    printf '=%.0s' $(seq 1 20)
    return 0
}

num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
    for num_batched_tokens in $num_batched_tokens_list; do
        run_benchmark $num_seqs $num_batched_tokens
    done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"

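After a full sweep, the winning combination can be pulled back out of the result file; a trivial follow-up sketch, assuming the same BASE and TAG as above:

```bash
# Prints the final best_* summary line appended by the removed script.
grep "best_max_num_seqs" "$BASE/auto-benchmark/$TAG/result.txt" | tail -n 1
```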
@ -1,6 +1,5 @@

# SPDX-License-Identifier: Apache-2.0

- import io
import json
import os
import sys

@ -33,7 +32,6 @@ class RequestFuncInput:

    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
-   language: Optional[str] = None


@dataclass
@ -201,7 +199,6 @@ async def async_request_deepspeed_mii(

                                     timeout=AIOHTTP_TIMEOUT) as session:

        payload = {
-           "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.

@ -222,15 +219,7 @@ async def async_request_deepspeed_mii(

                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                   if "choices" in parsed_resp:
-                       output.generated_text = parsed_resp["choices"][0][
-                           "text"]
-                   elif "text" in parsed_resp:
-                       output.generated_text = parsed_resp["text"][0]
-                   else:
-                       output.error = ("Unexpected response format: "
-                                       "neither 'choices' nor 'text' found")
-                       output.success = False
+                   output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
                    output.error = response.reason or ""
@ -261,7 +250,6 @@ async def async_request_openai_completions(

            if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
-           "repetition_penalty": 1.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
@ -440,110 +428,6 @@ async def async_request_openai_chat_completions(

    return output


- async def async_request_openai_audio(
-     request_func_input: RequestFuncInput,
-     pbar: Optional[tqdm] = None,
- ) -> RequestFuncOutput:
-     # Lazy import without PlaceholderModule to avoid vllm dep.
-     import soundfile
-     api_url = request_func_input.api_url
-     assert api_url.endswith(
-         ("transcriptions", "translations")), (
-             "OpenAI audio API URL must end with 'transcriptions' "
-             "or 'translations'.")
-
-     async with aiohttp.ClientSession(trust_env=True,
-                                      timeout=AIOHTTP_TIMEOUT) as session:
-         content = [{"type": "text", "text": request_func_input.prompt}]
-         payload = {
-             "model": request_func_input.model_name \
-                 if request_func_input.model_name else request_func_input.model,
-             "temperature": 0.0,
-             "max_completion_tokens": request_func_input.output_len,
-             "stream": True,
-             "language": "en",
-             # Flattened due to multipart/form-data
-             "stream_include_usage": True,
-             "stream_continuous_usage_stats": True
-         }
-         if request_func_input.extra_body:
-             payload.update(request_func_input.extra_body)
-         headers = {
-             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-         }
-
-         # Send audio file
-         def to_bytes(y, sr):
-             buffer = io.BytesIO()
-             soundfile.write(buffer, y, sr, format="WAV")
-             buffer.seek(0)
-             return buffer
-
-         with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
-             form = aiohttp.FormData()
-             form.add_field('file', f, content_type='audio/wav')
-             for key, value in payload.items():
-                 form.add_field(key, str(value))
-
-             output = RequestFuncOutput()
-             output.prompt_len = request_func_input.prompt_len
-
-             generated_text = ""
-             ttft = 0.0
-             st = time.perf_counter()
-             most_recent_timestamp = st
-             try:
-                 async with session.post(url=api_url,
-                                         data=form,
-                                         headers=headers) as response:
-                     if response.status == 200:
-                         async for chunk_bytes in response.content:
-                             chunk_bytes = chunk_bytes.strip()
-                             if not chunk_bytes:
-                                 continue
-
-                             chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                 "data: ")
-                             if chunk != "[DONE]":
-                                 timestamp = time.perf_counter()
-                                 data = json.loads(chunk)
-
-                                 if choices := data.get("choices"):
-                                     content = choices[0]["delta"].get(
-                                         "content")
-                                     # First token
-                                     if ttft == 0.0:
-                                         ttft = timestamp - st
-                                         output.ttft = ttft
-
-                                     # Decoding phase
-                                     else:
-                                         output.itl.append(
-                                             timestamp - most_recent_timestamp)
-
-                                     generated_text += content or ""
-                                 elif usage := data.get("usage"):
-                                     output.output_tokens = usage.get(
-                                         "completion_tokens")
-
-                                 most_recent_timestamp = timestamp
-
-                         output.generated_text = generated_text
-                         output.success = True
-                         output.latency = most_recent_timestamp - st
-                     else:
-                         output.error = response.reason or ""
-                         output.success = False
-             except Exception:
-                 output.success = False
-                 exc_info = sys.exc_info()
-                 output.error = "".join(traceback.format_exception(*exc_info))
-
-     if pbar:
-         pbar.update(1)
-     return output


def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

@ -601,14 +485,7 @@ ASYNC_REQUEST_FUNCS = {

    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
-   "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

- OPENAI_COMPATIBLE_BACKENDS = [
-     k for k, v in ASYNC_REQUEST_FUNCS.items()
-     if v in (async_request_openai_completions,
-              async_request_openai_chat_completions)
- ]

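The removed `async_request_openai_audio` follows the same streaming-latency bookkeeping as the other request functions: the first chunk sets TTFT, later chunks append inter-token latencies. A minimal, self-contained sketch of that accounting (the `chunks` list is a stand-in for the SSE stream, not vLLM API):

```python
import time

# Stand-in for a server-sent-event stream of generated text chunks.
chunks = ["Hello", " world", "!"]

itl = []                       # inter-token latencies
ttft = 0.0                     # time to first token
st = time.perf_counter()       # request start
most_recent = st
for _ in chunks:
    time.sleep(0.01)           # pretend we waited on the network
    now = time.perf_counter()
    if ttft == 0.0:
        ttft = now - st        # first chunk: record TTFT
    else:
        itl.append(now - most_recent)  # later chunks: record the gaps
    most_recent = now

latency = most_recent - st     # end-to-end latency, as in the removed code
print(f"ttft={ttft:.3f}s itl={itl} latency={latency:.3f}s")
```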
@ -23,8 +23,7 @@ from abc import ABC, abstractmethod

from collections.abc import Mapping
from dataclasses import dataclass
from functools import cache
- from io import BytesIO
- from typing import Any, Callable, Optional, Union
+ from typing import Any, Optional, Union

import numpy as np
import pandas as pd

@ -64,7 +63,6 @@ class SampleRequest:

class BenchmarkDataset(ABC):
    DEFAULT_SEED = 0
-   IS_MULTIMODAL = False

    def __init__(
        self,
@ -241,24 +239,21 @@ def process_image(image: Any) -> Mapping[str, Any]:

    """
    Process a single image input and return a multimedia content dictionary.

-   Supports three input types:
-
-   1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
-      containing raw image data. - Loads the bytes as a PIL.Image.Image.
-
-   2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
-      a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
-      a dictionary with the image as a base64 data URL.
-
-   3. String input: - Treats the string as a URL or local file path. -
-      Prepends "file://" if the string doesn't start with "http://" or
-      "file://". - Returns a dictionary with the image URL.
+   For a PIL.Image.Image input:
+   - Converts the image to RGB.
+   - Saves the image as a JPEG in-memory.
+   - Encodes the JPEG data as a base64 string.
+   - Returns a dictionary with the image as a base64 data URL.
+
+   For a string input:
+   - Treats the string as a URL or file path.
+   - Prepends "file://" if the string doesn't start with "http://" or
+     "file://".
+   - Returns a dictionary with the image URL.

    Raises:
-       ValueError: If the input is not a supported type.
+       ValueError: If the input is neither a PIL.Image.Image nor a string.
    """
-   if isinstance(image, dict) and 'bytes' in image:
-       image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:

@ -277,8 +272,8 @@ def process_image(image: Any) -> Mapping[str, Any]:

            ("http://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}

-   raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
-                    " or str or dictionary with raw image bytes.")
+   raise ValueError(
+       f"Invalid image input {image}. Must be a PIL.Image.Image or str.")


# -----------------------------------------------------------------------------
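The PIL branch described by this docstring is easy to exercise on its own; a minimal standalone sketch, assuming only Pillow (the `image_to_data_url` helper name is hypothetical):

```python
import base64
import io

from PIL import Image


def image_to_data_url(image: Image.Image) -> dict:
    """Mirror of the PIL branch: RGB convert, JPEG encode, base64 data URL."""
    image = image.convert("RGB")
    with io.BytesIO() as image_data:
        image.save(image_data, format="JPEG")
        image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
    }


# usage: a tiny solid image round-trips into an OpenAI-style content dict
print(image_to_data_url(Image.new("RGB", (8, 8)))["type"])
```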
@ -289,7 +284,7 @@ def process_image(image: Any) -> Mapping[str, Any]:

class RandomDataset(BenchmarkDataset):
    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
-   DEFAULT_RANGE_RATIO = 0.0
+   DEFAULT_RANGE_RATIO = 1.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128
@ -309,34 +304,19 @@ class RandomDataset(BenchmarkDataset):

        output_len: int = DEFAULT_OUTPUT_LEN,
        **kwargs,
    ) -> list[SampleRequest]:
-       # Enforce range_ratio < 1
-       assert range_ratio < 1.0, (
-           "random_range_ratio must be < 1.0 to ensure a valid sampling range"
-       )
-
        vocab_size = tokenizer.vocab_size
-       num_special_tokens = tokenizer.num_special_tokens_to_add()
-       real_input_len = input_len - num_special_tokens

        prefix_token_ids = (np.random.randint(
            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])

-       # New sampling logic: [X * (1 - b), X * (1 + b)]
-       input_low = int(real_input_len * (1 - range_ratio))
-       input_high = int(real_input_len * (1 + range_ratio))
-       output_low = int(output_len * (1 - range_ratio))
-       output_high = int(output_len * (1 + range_ratio))
-
-       # Add logging for debugging
-       logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
-       logger.info("Sampling output_len from [%s, %s]", output_low,
-                   output_high)
+       input_low = int(input_len * range_ratio)
+       output_low = int(output_len * range_ratio)

        input_lens = np.random.randint(input_low,
-                                      input_high + 1,
+                                      input_len + 1,
                                       size=num_requests)
        output_lens = np.random.randint(output_low,
-                                       output_high + 1,
+                                       output_len + 1,
                                        size=num_requests)
        offsets = np.random.randint(0, vocab_size, size=num_requests)
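To make the two sampling windows in this hunk concrete: the removed code draws lengths from a symmetric window [X * (1 - b), X * (1 + b)], while the retained code draws from a one-sided window [X * r, X]. A small numpy sketch (variable names are illustrative, not from the file):

```python
import numpy as np

rng = np.random.default_rng(0)
input_len, b, r = 1024, 0.3, 0.3

# Removed side of the hunk: symmetric window around input_len.
symmetric = rng.integers(int(input_len * (1 - b)),
                         int(input_len * (1 + b)) + 1, size=5)

# Retained side of the hunk: one-sided window capped at input_len.
one_sided = rng.integers(int(input_len * r), input_len + 1, size=5)

print("symmetric:", symmetric)   # values can exceed input_len
print("one-sided:", one_sided)   # values never exceed input_len
```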
@ -346,17 +326,6 @@ class RandomDataset(BenchmarkDataset):

                vocab_size).tolist()
            token_sequence = prefix_token_ids + inner_seq
            prompt = tokenizer.decode(token_sequence)
-           # After decoding the prompt we have to encode and decode it again.
-           # This is done because in some cases N consecutive tokens
-           # give a string tokenized into != N number of tokens.
-           # For example for GPT2Tokenizer:
-           # [6880, 6881] -> ['Ġcalls', 'here'] ->
-           # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
-           # To avoid uncontrolled change of the prompt length,
-           # the encoded sequence is truncated before being decoded again.
-           re_encoded_sequence = tokenizer.encode(
-               prompt, add_special_tokens=False)[:input_lens[i]]
-           prompt = tokenizer.decode(re_encoded_sequence)
            total_input_len = prefix_len + int(input_lens[i])
            requests.append(
                SampleRequest(
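The removed comment's round-trip effect can be reproduced directly; a short sketch assuming `transformers` is installed and the GPT-2 tokenizer can be downloaded (token ids taken from the comment above):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")

ids = [6880, 6881]                        # example from the removed comment
text = tok.decode(ids)                    # decode N tokens to text
re_ids = tok.encode(text, add_special_tokens=False)
print(ids, "->", repr(text), "->", re_ids)  # re-encoding may yield != N tokens

# The removed code pinned the length by truncating before decoding again:
pinned = tok.decode(re_ids[:len(ids)])
```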
@ -499,11 +468,11 @@ class SonnetDataset(BenchmarkDataset):

        # Determine how many poem lines to use.
        num_input_lines = round((input_len - base_offset) / avg_len)
-       num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
+       num_prefix_lines = round((prefix_len - base_offset) / avg_len)
        prefix_lines = self.data[:num_prefix_lines]

        samples = []
-       while len(samples) < num_requests:
+       for _ in range(num_requests):
            extra_lines = random.choices(self.data,
                                         k=num_input_lines - num_prefix_lines)
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"

@ -511,14 +480,13 @@ class SonnetDataset(BenchmarkDataset):

            prompt_formatted = tokenizer.apply_chat_template(
                msg, add_generation_prompt=True, tokenize=False)
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
-           if prompt_len <= input_len:
-               samples.append(
-                   SampleRequest(
-                       prompt=prompt_formatted
-                       if return_prompt_formatted else prompt,
-                       prompt_len=prompt_len,
-                       expected_output_len=output_len,
-                   ))
+           samples.append(
+               SampleRequest(
+                   prompt=prompt_formatted
+                   if return_prompt_formatted else prompt,
+                   prompt_len=prompt_len,
+                   expected_output_len=output_len,
+               ))
        return samples
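The loop change above swaps a draw-until-accepted pattern (`while` plus the `prompt_len <= input_len` check) for a fixed number of draws. A toy sketch of the two patterns (illustrative only, not the file's code):

```python
import random


def sample_until(n, gen, ok):
    # Removed pattern: keep drawing until n samples pass the check,
    # so the result always has exactly n accepted items.
    out = []
    while len(out) < n:
        s = gen()
        if ok(s):
            out.append(s)
    return out


def sample_fixed(n, gen):
    # Retained pattern: exactly n draws, no post-filter.
    return [gen() for _ in range(n)]


print(len(sample_until(3, random.random, lambda s: s < 0.5)))  # 3
print(len(sample_fixed(3, random.random)))                     # 3
```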
@ -594,48 +562,48 @@ class BurstGPTDataset(BenchmarkDataset):


# -----------------------------------------------------------------------------
- # HuggingFace Dataset Base Implementation
+ # HuggingFace Dataset Implementation
# -----------------------------------------------------------------------------


- class HuggingFaceDataset(BenchmarkDataset):
-     """Base class for datasets hosted on HuggingFace."""
-
-     SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
+ class HuggingFaceDataset(BenchmarkDataset):
+     """
+     Dataset class for processing a HuggingFace dataset with conversation data
+     and optional images.
+     """

    def __init__(
        self,
-       dataset_path: str,
        dataset_split: str,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
-       super().__init__(dataset_path=dataset_path, **kwargs)
+       super().__init__(**kwargs)

        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset

        self.load_data()

    def load_data(self) -> None:
-       """Load data from HuggingFace datasets."""
+       if not self.dataset_path:
+           raise ValueError("dataset_path must be provided for loading data.")
+
        self.data = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=True,
        )
-       self.data = self.data.shuffle(seed=self.random_seed)
-
-
- # -----------------------------------------------------------------------------
- # Conversation Dataset Implementation
- # -----------------------------------------------------------------------------
-
-
- class ConversationDataset(HuggingFaceDataset):
-     """Dataset for conversation data with multimodal support."""
-     SUPPORTED_DATASET_PATHS = {
-         'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
-     }
-     IS_MULTIMODAL = True
+       if self.data.features is None or "conversations" \
+               not in self.data.features:
+           raise ValueError(
+               "HuggingFaceDataset currently only supports datasets with "
+               "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+               "Please consider contributing if you would like to add "
+               "support for additional dataset formats.")
+       # Shuffle and filter examples with at least 2 conversations.
+       self.data = self.data.shuffle(seed=self.random_seed).filter(
+           lambda x: len(x["conversations"]) >= 2)

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,

@ -643,13 +611,10 @@ class ConversationDataset(HuggingFaceDataset):

               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
-       # Filter examples with at least 2 conversations
-       filtered_data = self.data.filter(
-           lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        dynamic_output = output_len is None

-       for item in filtered_data:
+       for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]

@ -694,13 +659,29 @@ class VisionArenaDataset(HuggingFaceDataset):

    """

    DEFAULT_OUTPUT_LEN = 128
-   SUPPORTED_DATASET_PATHS = {
-       "lmarena-ai/VisionArena-Chat":
-       lambda x: x["conversation"][0][0]["content"],
-       "lmarena-ai/vision-arena-bench-v0.1":
-       lambda x: x["turns"][0][0]["content"]
-   }
-   IS_MULTIMODAL = True
+   VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
+
+   def __init__(
+       self,
+       **kwargs,
+   ) -> None:
+       super().__init__(**kwargs)
+       if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
+           raise ValueError(f"Only support Vision Arena dataset.\
+               This data path {self.dataset_path} is not valid.")
+       if self.dataset_subset is None and self.dataset_split != "train":
+           raise ValueError("Dataset split must be 'train'.")
+
+       self.load_data()
+
+   def load_data(self) -> None:
+       dataset = load_dataset(
+           self.dataset_path,
+           name=self.dataset_subset,
+           split=self.dataset_split,
+           streaming=True,
+       )
+       self.data = dataset.shuffle(seed=self.random_seed)

    def sample(
        self,
@ -716,11 +697,7 @@ class VisionArenaDataset(HuggingFaceDataset):

        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-           parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
-           if parser_fn is None:
-               raise ValueError(
-                   f"Unsupported dataset path: {self.dataset_path}")
-           prompt = parser_fn(item)
+           prompt = item["turns"][0][0]["content"]
            mm_content = process_image(item["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:

@ -738,315 +715,3 @@ class VisionArenaDataset(HuggingFaceDataset):

                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
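The removed `SUPPORTED_DATASET_PATHS` mapping is a small dispatch table from dataset path to a row-parsing function; a standalone sketch of the pattern (hypothetical names, not vLLM API):

```python
from typing import Any, Callable

# Dispatch table mirroring the removed mapping: each dataset path knows how
# to pull the first user prompt out of its own row schema.
PARSERS: dict[str, Callable[[dict], Any]] = {
    "lmarena-ai/VisionArena-Chat":
        lambda x: x["conversation"][0][0]["content"],
    "lmarena-ai/vision-arena-bench-v0.1":
        lambda x: x["turns"][0][0]["content"],
}


def extract_prompt(dataset_path: str, row: dict) -> Any:
    parser = PARSERS.get(dataset_path)
    if parser is None:
        raise ValueError(f"Unsupported dataset path: {dataset_path}")
    return parser(row)


# usage with a minimal fake row in the vision-arena-bench-v0.1 schema
row = {"turns": [[{"content": "What is in this image?"}]]}
print(extract_prompt("lmarena-ai/vision-arena-bench-v0.1", row))
```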
# -----------------------------------------------------------------------------
|
|
||||||
# Instruct Coder Dataset Implementation
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class InstructCoderDataset(HuggingFaceDataset):
|
|
||||||
"""
|
|
||||||
InstructCoder Dataset.
|
|
||||||
https://huggingface.co/datasets/likaixin/InstructCoder
|
|
||||||
|
|
||||||
InstructCoder is the dataset designed for general code editing. It consists
|
|
||||||
of 114,239 instruction-input-output triplets, and covers multiple distinct
|
|
||||||
code editing scenario.
|
|
||||||
"""
|
|
||||||
|
|
||||||
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
|
|
||||||
SUPPORTED_DATASET_PATHS = {
|
|
||||||
"likaixin/InstructCoder",
|
|
||||||
}
|
|
||||||
|
|
||||||
def sample(self,
|
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
num_requests: int,
|
|
||||||
output_len: Optional[int] = None,
|
|
||||||
enable_multimodal_chat: bool = False,
|
|
||||||
**kwargs) -> list:
|
|
||||||
output_len = (output_len
|
|
||||||
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
|
||||||
sampled_requests = []
|
|
||||||
for item in self.data:
|
|
||||||
if len(sampled_requests) >= num_requests:
|
|
||||||
break
|
|
||||||
prompt = f"{item['instruction']}:\n{item['input']}"
|
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
|
||||||
sampled_requests.append(
|
|
||||||
SampleRequest(
|
|
||||||
prompt=prompt,
|
|
||||||
prompt_len=prompt_len,
|
|
||||||
expected_output_len=output_len,
|
|
||||||
))
|
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
|
||||||
return sampled_requests
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# MT-Bench Dataset Implementation
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class MTBenchDataset(HuggingFaceDataset):
|
|
||||||
"""
|
|
||||||
MT-Bench Dataset.
|
|
||||||
https://huggingface.co/datasets/philschmid/mt-bench
|
|
||||||
|
|
||||||
We create a single turn dataset for MT-Bench.
|
|
||||||
This is similar to Spec decoding benchmark setup in vLLM
|
|
||||||
https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
|
|
||||||
""" # noqa: E501
|
|
||||||
|
|
||||||
DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
|
|
||||||
SUPPORTED_DATASET_PATHS = {
|
|
||||||
"philschmid/mt-bench",
|
|
||||||
}
|
|
||||||
|
|
||||||
def sample(self,
|
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
num_requests: int,
|
|
||||||
output_len: Optional[int] = None,
|
|
||||||
enable_multimodal_chat: bool = False,
|
|
||||||
**kwargs) -> list:
|
|
||||||
output_len = (output_len
|
|
||||||
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
|
||||||
sampled_requests = []
|
|
||||||
|
|
||||||
for item in self.data:
|
|
||||||
if len(sampled_requests) >= num_requests:
|
|
||||||
break
|
|
||||||
prompt = item['turns'][0]
|
|
||||||
|
|
||||||
# apply template
|
|
||||||
prompt = tokenizer.apply_chat_template([{
|
|
||||||
"role": "user",
|
|
||||||
"content": prompt
|
|
||||||
}],
|
|
||||||
add_generation_prompt=True,
|
|
||||||
tokenize=False)
|
|
||||||
|
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
|
||||||
sampled_requests.append(
|
|
||||||
SampleRequest(
|
|
||||||
prompt=prompt,
|
|
||||||
prompt_len=prompt_len,
|
|
||||||
expected_output_len=output_len,
|
|
||||||
))
|
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
|
||||||
return sampled_requests
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# AIMO Dataset Implementation
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class AIMODataset(HuggingFaceDataset):
|
|
||||||
"""
|
|
||||||
Dataset class for processing a AIMO dataset with reasoning questions.
|
|
||||||
"""
|
|
||||||
SUPPORTED_DATASET_PATHS = {
|
|
||||||
"AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
|
|
||||||
"AI-MO/NuminaMath-CoT"
|
|
||||||
}
|
|
||||||
|
|
||||||
def sample(self,
|
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
num_requests: int,
|
|
||||||
output_len: Optional[int] = None,
|
|
||||||
**kwargs) -> list:
|
|
||||||
sampled_requests = []
|
|
||||||
dynamic_output = output_len is None
|
|
||||||
|
|
||||||
for item in self.data:
|
|
||||||
if len(sampled_requests) >= num_requests:
|
|
||||||
break
|
|
||||||
prompt, completion = item['problem'], item["solution"]
|
|
||||||
|
|
||||||
prompt_ids = tokenizer(prompt).input_ids
|
|
||||||
completion_ids = tokenizer(completion).input_ids
|
|
||||||
prompt_len = len(prompt_ids)
|
|
||||||
completion_len = len(completion_ids)
|
|
||||||
output_len = completion_len if dynamic_output else output_len
|
|
||||||
assert isinstance(output_len, int) and output_len > 0
|
|
||||||
if dynamic_output and not is_valid_sequence(prompt_len,
|
|
||||||
completion_len,
|
|
||||||
max_prompt_len=2048,
|
|
||||||
max_total_len=32000):
|
|
||||||
continue
|
|
||||||
sampled_requests.append(
|
|
||||||
SampleRequest(
|
|
||||||
prompt=prompt,
|
|
||||||
prompt_len=prompt_len,
|
|
||||||
expected_output_len=output_len,
|
|
||||||
multi_modal_data=None,
|
|
||||||
))
|
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
|
||||||
return sampled_requests
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Next Edit Prediction Dataset Implementation
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
zeta_prompt = """### Instruction:
|
|
||||||
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
|
|
||||||
|
|
||||||
### User Edits:
|
|
||||||
|
|
||||||
{}
|
|
||||||
|
|
||||||
### User Excerpt:
|
|
||||||
|
|
||||||
{}
|
|
||||||
|
|
||||||
### Response:
|
|
||||||
|
|
||||||
""" # noqa: E501
|
|
||||||
|
|
||||||
|
|
||||||
def _format_zeta_prompt(
|
|
||||||
sample: dict,
|
|
||||||
original_start_marker: str = "<|editable_region_start|>") -> dict:
|
|
||||||
"""Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
|
|
||||||
|
|
||||||
This function formats examples from the NEP dataset
|
|
||||||
into prompts and expected outputs. It could be
|
|
||||||
further extended to support more NEP datasets.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
sample: The dataset sample containing events,
|
|
||||||
inputs, and outputs.
|
|
||||||
original_start_marker: The marker indicating the
|
|
||||||
start of the editable region. Defaults to
|
|
||||||
"<|editable_region_start|>".
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A dictionary with the formatted prompts and expected outputs.
|
|
||||||
"""
|
|
||||||
events = sample["events"]
|
|
||||||
input = sample["input"]
|
|
||||||
output = sample["output"]
|
|
||||||
prompt = zeta_prompt.format(events, input)
|
|
||||||
|
|
||||||
# following the original implementation, extract the focused region
|
|
||||||
# from the raw output
|
|
||||||
output_start_index = output.find(original_start_marker)
|
|
||||||
output_focused_region = output[output_start_index:]
|
|
||||||
expected_output = output_focused_region
|
|
||||||
|
|
||||||
return {"prompt": prompt, "expected_output": expected_output}
|
|
||||||
|
|
||||||
|
|
||||||
class NextEditPredictionDataset(HuggingFaceDataset):
    """
    Dataset class for processing a Next Edit Prediction dataset.
    """

    SUPPORTED_DATASET_PATHS = {
        "zed-industries/zeta",
    }
    MAPPING_PROMPT_FUNCS = {
        "zed-industries/zeta": _format_zeta_prompt,
    }

    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
               **kwargs):
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(
            self.dataset_path)
        if formatting_prompt_func is None:
            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
        samples = []
        for sample in self.data:
            sample = formatting_prompt_func(sample)
            samples.append(
                SampleRequest(
                    prompt=sample["prompt"],
                    prompt_len=len(tokenizer(sample["prompt"]).input_ids),
                    expected_output_len=len(
                        tokenizer(sample["expected_output"]).input_ids),
                ))
            if len(samples) >= num_requests:
                break
        self.maybe_oversample_requests(samples, num_requests)
        return samples

# -----------------------------------------------------------------------------
# ASR Dataset Implementation
# -----------------------------------------------------------------------------


class ASRDataset(HuggingFaceDataset):
    """
    Dataset class for processing an ASR dataset for transcription.
    Tested on the following set:

    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
    |                |                                        |                          | release3-speaker-adaptation |
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+

    """ # noqa: E501
    SUPPORTED_DATASET_PATHS = {
        "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
        "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
    }

    DEFAULT_OUTPUT_LEN = 128
    IS_MULTIMODAL = True

    # TODO Whisper-specific. Abstract interface when more models are supported.
    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>" \
                             "<|notimestamps|>"
    skip_long_audios: bool = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        **kwargs,
    ) -> list:
        import librosa
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
        prompt_len = len(tokenizer(prompt).input_ids)
        sampled_requests = []
        skipped = 0
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            audio = item["audio"]
            y, sr = audio["array"], audio["sampling_rate"]
            duration_s = librosa.get_duration(y=y, sr=sr)
            # Whisper max supported duration
            if self.skip_long_audios and duration_s > 30:
                skipped += 1
                continue

            mm_content = {"audio": (y, sr)}
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        if skipped:
            logger.warning("%d samples discarded from dataset due to"
                           " their length being greater than"
                           " what Whisper supports.", skipped)
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
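
The 30-second cutoff mirrors Whisper's fixed input window. A minimal sketch of the same duration check on a synthetic clip (values invented):

    import numpy as np
    import librosa

    sr = 16000
    y = np.zeros(45 * sr, dtype=np.float32)  # 45 s of silence
    duration_s = librosa.get_duration(y=y, sr=sr)
    assert duration_s > 30  # such an item would be skipped by ASRDataset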
@ -63,16 +63,14 @@ class Request:
     output_len: int


-def sample_tokens(tokenizer: PreTrainedTokenizerBase,
-                  length: int) -> list[int]:
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
     vocab = tokenizer.get_vocab()
-    all_special_ids = set(tokenizer.all_special_ids)

     # Remove the special tokens.
-    return random.choices(
-        [v for k, v in vocab.items() if k not in all_special_ids],
-        k=length,
-    )
+    vocab = {
+        k: v
+        for k, v in vocab.items() if k not in tokenizer.all_special_ids
+    }
+    return random.choices(list(vocab.values()), k=length)


 def sample_requests_from_dataset(
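
One thing worth noting about both sides of this hunk: `tokenizer.get_vocab()` maps token strings to ids, while `all_special_ids` holds ints, so a key-based membership test likely filters nothing. If the intent is to drop special tokens, filtering on the id does it — a sketch under that assumption, not the file's actual code:

    import random

    def sample_token_ids(tokenizer, length: int) -> list[int]:
        special = set(tokenizer.all_special_ids)
        # Filter on the id (the dict value), not the token string (the key).
        candidates = [
            v for v in tokenizer.get_vocab().values() if v not in special
        ]
        return random.choices(candidates, k=length)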
@ -7,6 +7,9 @@ On the server side, run one of the following commands:
         --swap-space 16 \
         --disable-log-requests

+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
 On the client side, run:
     python benchmarks/benchmark_serving.py \
         --backend <backend> \
@ -34,8 +37,7 @@ from datetime import datetime
 from typing import Any, Optional

 import numpy as np
-from backend_request_func import (ASYNC_REQUEST_FUNCS,
-                                  OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
-                                  RequestFuncOutput)
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
@ -50,12 +52,9 @@ try:
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser

-from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
-                               ConversationDataset, HuggingFaceDataset,
-                               InstructCoderDataset, MTBenchDataset,
-                               NextEditPredictionDataset, RandomDataset,
-                               SampleRequest, ShareGPTDataset, SonnetDataset,
-                               VisionArenaDataset)
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -157,7 +156,7 @@ def calculate_metrics(
         if outputs[i].success:
             output_len = outputs[i].output_tokens

-            if not output_len:
+            if output_len is None:
                 # We use the tokenizer to count the number of output tokens
                 # for some serving backends instead of looking at
                 # len(outputs[i].itl) since multiple output tokens may be
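
The `not output_len` vs. `is None` distinction matters when a backend legitimately reports zero output tokens; a one-liner illustrating the difference:

    output_len = 0
    assert not output_len          # truthiness: 0 would trigger re-tokenizing
    assert output_len is not None  # identity: 0 is kept as a real count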
@ -262,7 +261,6 @@ async def benchmark(
     goodput_config_dict: dict[str, float],
     max_concurrency: Optional[int],
     lora_modules: Optional[Iterable[str]],
-    extra_body: Optional[dict],
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@ -275,6 +273,10 @@ async def benchmark(
         input_requests[0].expected_output_len, \
         input_requests[0].multi_modal_data

+    if backend != "openai-chat" and test_mm_content is not None:
+        # multi-modal benchmark is only available on OpenAI Chat backend.
+        raise ValueError(
+            "Multi-modal content is only supported on 'openai-chat' backend.")
     assert test_mm_content is None or isinstance(test_mm_content, dict)
     test_input = RequestFuncInput(
         model=model_id,
@ -286,7 +288,6 @@ async def benchmark(
         logprobs=logprobs,
         multi_modal_content=test_mm_content,
         ignore_eos=ignore_eos,
-        extra_body=extra_body,
     )

     test_output = await request_func(request_func_input=test_input)
@ -313,8 +314,7 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
             multi_modal_content=test_mm_content,
-            ignore_eos=ignore_eos,
-            extra_body=extra_body)
+            ignore_eos=ignore_eos)
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
             print("Profiler started")
@ -364,8 +364,7 @@ async def benchmark(
             output_len=output_len,
             logprobs=logprobs,
             multi_modal_content=mm_content,
-            ignore_eos=ignore_eos,
-            extra_body=extra_body)
+            ignore_eos=ignore_eos)
         tasks.append(
             asyncio.create_task(
                 limited_request_func(request_func_input=request_func_input,
@ -587,55 +586,19 @@ def main(args: argparse.Namespace):
                                       return_prompt_formatted=True)

     elif args.dataset_name == "hf":
-        # all following datasets are implemented from the
-        # HuggingFaceDataset base class
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = VisionArenaDataset
-            args.hf_split = "train"
-            args.hf_subset = None
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = InstructCoderDataset
-            args.hf_split = "train"
-        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = MTBenchDataset
-            args.hf_split = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = ConversationDataset
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = AIMODataset
-            args.hf_split = "train"
-        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
-            dataset_class = NextEditPredictionDataset
-            args.hf_split = "train"
-        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = ASRDataset
-            args.hf_split = "train"
-        else:
-            supported_datasets = set([
-                dataset_name for cls in HuggingFaceDataset.__subclasses__()
-                for dataset_name in cls.SUPPORTED_DATASET_PATHS
-            ])
-            raise ValueError(
-                f"Unsupported dataset path: {args.dataset_path}. "
-                "Huggingface dataset only supports dataset_path"
-                f" from one of following: {supported_datasets}. "
-                "Please consider contributing if you would "
-                "like to add support for additional dataset formats.")
-
-        if (dataset_class.IS_MULTIMODAL and backend not in \
-                ["openai-chat", "openai-audio"]):
-            # multi-modal benchmark is only available on OpenAI Chat backend.
-            raise ValueError(
-                "Multi-modal content is only supported on 'openai-chat' and " \
-                "'openai-audio' backend.")
+        # Choose between VisionArenaDataset
+        # and HuggingFaceDataset based on provided parameters.
+        dataset_class = (VisionArenaDataset if args.dataset_path
+                         == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                         and args.hf_subset is None else HuggingFaceDataset)
         input_requests = dataset_class(
             dataset_path=args.dataset_path,
             dataset_subset=args.hf_subset,
             dataset_split=args.hf_split,
-            random_seed=args.seed,
         ).sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            random_seed=args.seed,
             output_len=args.hf_output_len,
         )
@ -670,26 +633,6 @@ def main(args: argparse.Namespace):
         raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
     goodput_config_dict = check_goodput_args(args)

-    # Collect the sampling parameters.
-    sampling_params = {
-        k: v
-        for k, v in {
-            "top_p": args.top_p,
-            "top_k": args.top_k,
-            "min_p": args.min_p,
-            "temperature": args.temperature
-        }.items() if v is not None
-    }
-
-    # Sampling parameters are only supported by openai-compatible backend.
-    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
-        raise ValueError(
-            "Sampling parameters are only supported by openai-compatible "
-            "backends.")
-
-    if "temperature" not in sampling_params:
-        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
-
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
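
The removed block uses a dict comprehension to keep only explicitly set sampling knobs; the same pattern in isolation (argument values invented):

    raw = {"top_p": None, "top_k": 20, "min_p": None, "temperature": 0.8}
    sampling_params = {k: v for k, v in raw.items() if v is not None}
    # -> {'top_k': 20, 'temperature': 0.8}
    sampling_params.setdefault("temperature", 0.0)  # greedy unless specified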
@ -716,11 +659,10 @@ def main(args: argparse.Namespace):
             goodput_config_dict=goodput_config_dict,
             max_concurrency=args.max_concurrency,
             lora_modules=args.lora_modules,
-            extra_body=sampling_params,
         ))

     # Save config and results to json
-    if args.save_result or args.append_result:
+    if args.save_result:
         result_json: dict[str, Any] = {}

         # Setup
@ -741,14 +683,6 @@ def main(args: argparse.Namespace):
                 raise ValueError(
                     "Invalid metadata format. Please use KEY=VALUE format."
                 )
-        # Traffic
-        result_json["request_rate"] = (args.request_rate if args.request_rate
-                                       < float("inf") else "inf")
-        result_json["burstiness"] = args.burstiness
-        result_json["max_concurrency"] = args.max_concurrency
-
-        # Merge with benchmark result
-        result_json = {**result_json, **benchmark_result}

         if not args.save_detailed:
             # Remove fields with too many data points
@ -759,6 +693,15 @@ def main(args: argparse.Namespace):
                 if field in result_json:
                     del result_json[field]

+        # Traffic
+        result_json["request_rate"] = (args.request_rate if args.request_rate
+                                       < float("inf") else "inf")
+        result_json["burstiness"] = args.burstiness
+        result_json["max_concurrency"] = args.max_concurrency
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
         # Save to file
         base_model_id = model_id.split("/")[-1]
         max_concurrency_str = (f"-concurrency{args.max_concurrency}"
@ -768,12 +711,7 @@ def main(args: argparse.Namespace):
             file_name = args.result_filename
         if args.result_dir:
             file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name,
-                  mode="a+" if args.append_result else "w",
-                  encoding='utf-8') as outfile:
-            # Append a newline.
-            if args.append_result and outfile.tell() != 0:
-                outfile.write("\n")
+        with open(file_name, "w", encoding='utf-8') as outfile:
             json.dump(result_json, outfile)
         save_to_pytorch_benchmark_format(args, result_json, file_name)
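
The `--append-result` path opens the file in `a+` mode and separates runs with a newline, which effectively produces JSON Lines. A minimal sketch of reading such a file back (the path is invented for illustration):

    import json

    results = []
    with open("benchmark_results.json", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))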
@ -905,11 +843,6 @@ if __name__ == "__main__":
         help="When saving the results, whether to include per request "
         "information such as response, error, ttfs, tpots, etc.",
     )
-    parser.add_argument(
-        "--append-result",
-        action="store_true",
-        help="Append the benchmark result to the existing json file.",
-    )
     parser.add_argument(
         "--metadata",
         metavar="KEY=VALUE",
@ -943,7 +876,7 @@ if __name__ == "__main__":
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-seperated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
         "Default value is \"ttft,tpot,itl\".")
@ -951,7 +884,7 @@ if __name__ == "__main__":
         "--metric-percentiles",
         type=str,
         default="99",
-        help="Comma-separated list of percentiles for selected metrics. "
+        help="Comma-seperated list of percentiles for selected metrics. "
         "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
         "Default value is \"99\". "
         "Use \"--percentile-metrics\" to select metrics.",
@ -1018,23 +951,18 @@ if __name__ == "__main__":
     random_group.add_argument(
         "--random-range-ratio",
         type=float,
-        default=0.0,
-        help="Range ratio for sampling input/output length, "
-        "used only for random sampling. Must be in the range [0, 1) to define "
-        "a symmetric sampling range"
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
     )
     random_group.add_argument(
         "--random-prefix-len",
         type=int,
         default=0,
-        help=("Number of fixed prefix tokens before the random context "
-              "in a request. "
-              "The total input length is the sum of `random-prefix-len` and "
-              "a random "
-              "context length sampled from [input_len * (1 - range_ratio), "
-              "input_len * (1 + range_ratio)]."),
-    )
+        help="Number of fixed prefix tokens before random "
+        " context. The length range of context in a random "
+        " request is [random-prefix-len, "
+        " random-prefix-len + random-prefix-len * random-range-ratio).")

     hf_group = parser.add_argument_group("hf dataset options")
     hf_group.add_argument("--hf-subset",
@ -1053,33 +981,6 @@ if __name__ == "__main__":
         "from the sampled HF dataset.",
     )

-    sampling_group = parser.add_argument_group("sampling parameters")
-    sampling_group.add_argument(
-        "--top-p",
-        type=float,
-        default=None,
-        help="Top-p sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--top-k",
-        type=int,
-        default=None,
-        help="Top-k sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--min-p",
-        type=float,
-        default=None,
-        help="Min-p sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--temperature",
-        type=float,
-        default=None,
-        help="Temperature sampling parameter. Only has effect on "
-        "openai-compatible backends. If not specified, default to greedy "
-        "decoding (i.e. temperature==0.0).")
-
     parser.add_argument(
         '--tokenizer-mode',
         type=str,
@ -5,13 +5,16 @@ On the server side, run one of the following commands:
     (vLLM OpenAI API server)
     vllm serve <your_model> --disable-log-requests

+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
 On the client side, run:
     python benchmarks/benchmark_serving_structured_output.py \
         --backend <backend> \
         --model <your_model> \
         --dataset json \
         --structured-output-ratio 1.0 \
-        --structured-output-backend auto \
+        --structured-output-backend xgrammar \
         --request-rate 10 \
         --num-prompts 1000
@ -51,7 +54,7 @@ try:
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser

-from vllm.v1.structured_output.backend_xgrammar import (
+from vllm.v1.structured_output.utils import (
     has_xgrammar_unsupported_json_features)

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -123,8 +126,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
             copy.deepcopy(schema) for _ in range(args.num_prompts)
         ]
         for i in range(len(json_schemas)):
-            if "properties" not in json_schemas[i]:
-                json_schemas[i]["properties"] = {}
             json_schemas[i]["properties"][
                 f"__optional_field_{uuid.uuid4()}"] = {
                     "type":
@ -132,11 +133,10 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                     "description":
                     "An unique optional field to avoid cached schemas"
                 }
-    else:
-        json_schemas = [schema] * args.num_prompts

     def gen_prompt(index: int):
-        return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}"  # noqa: E501
+        schema = json_schemas[index % len(json_schemas)]
+        return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501

     def get_schema(index: int):
         return json_schemas[index % len(json_schemas)]
@ -152,17 +152,17 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,

     elif args.dataset == "grammar":
         schema = """
-            root ::= select_statement
+            ?start: select_statement

-            select_statement ::= "SELECT " column " from " table " where " condition
+            ?select_statement: "SELECT " column_list " FROM " table_name

-            column ::= "col_1 " | "col_2 "
+            ?column_list: column_name ("," column_name)*

-            table ::= "table_1 " | "table_2 "
+            ?table_name: identifier

-            condition ::= column "= " number
+            ?column_name: identifier

-            number ::= "1 " | "2 "
+            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
         """
         prompt = "Generate an SQL query to show the 'username' \
             and 'email' from the 'users' table."
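
The replacement grammar is Lark-style EBNF rather than the GBNF-like `::=` notation. Purely as an illustration of the new syntax (not part of the benchmark), such a grammar can be loaded with the `lark` package:

    from lark import Lark

    sql_grammar = r"""
    ?start: select_statement
    ?select_statement: "SELECT " column_list " FROM " table_name
    ?column_list: column_name ("," column_name)*
    ?table_name: identifier
    ?column_name: identifier
    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
    """
    parser = Lark(sql_grammar)  # raises if the grammar itself is malformed
    # parser.parse("SELECT username,email FROM users") would yield a parse tree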
@ -233,8 +233,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
             idx -= len_dataset
         schema = dataset["schema"][idx]
         prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
-                                               tokenize=False,
-                                               add_generation_prompt=True)
+                                               tokenize=False)
         input_len = len(tokenizer(prompt).input_ids)
         completion = dataset["completion"][idx]
@ -414,6 +413,7 @@ async def benchmark(
     ignore_eos: bool,
     max_concurrency: Optional[int],
     structured_output_ratio: float,
+    structured_output_backend: str,
     goodput_config_dict: Optional[dict[str, float]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
@ -425,6 +425,8 @@ async def benchmark(
             extra_body = {}
         # Add the schema to the extra_body
         extra_body[request.structure_type] = request.schema
+        # Add the specific structured_output_backend
+        extra_body["guided_decoding_backend"] = structured_output_backend
         return extra_body

     print("Starting initial single prompt test run...")
@ -782,6 +784,7 @@ def main(args: argparse.Namespace):
             ignore_eos=args.ignore_eos,
             max_concurrency=args.max_concurrency,
             structured_output_ratio=args.structured_output_ratio,
+            structured_output_backend=args.structured_output_backend,
             goodput_config_dict=goodput_config_dict,
         ))
@ -848,7 +851,7 @@ if __name__ == "__main__":
             'json', 'json-unique', 'grammar', 'regex',
             'choice', 'xgrammar_bench'
         ])
-    parser.add_argument("--json-schema-path",
+    parser.add_argument("--json_schema_path",
                         type=str,
                         default=None,
                         help="Path to json schema.")
@ -963,7 +966,7 @@ if __name__ == "__main__":
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-seperated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
         "Default value is \"ttft,tpot,itl\".")
@ -971,7 +974,7 @@ if __name__ == "__main__":
         "--metric-percentiles",
         type=str,
         default="99",
-        help="Comma-separated list of percentiles for selected metrics. "
+        help="Comma-seperated list of percentiles for selected metrics. "
         "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
         "Default value is \"99\". "
         "Use \"--percentile-metrics\" to select metrics.",
@ -996,6 +999,12 @@ if __name__ == "__main__":
         type=float,
         default=1.0,
         help="Ratio of Structured Outputs requests")
+    parser.add_argument(
+        "--structured-output-backend",
+        type=str,
+        choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
+        default="xgrammar",
+        help="Backend to use for structured outputs")

     args = parser.parse_args()
     main(args)
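
With the new flag wired through, each request's `extra_body` ends up carrying the backend choice. A hedged sketch of the resulting payload fragment — the field names come from the hunks above, while the schema value and backend are invented for illustration:

    extra_body = {}
    extra_body["guided_json"] = {"type": "object"}      # request.structure_type -> request.schema
    extra_body["guided_decoding_backend"] = "xgrammar"  # from --structured-output-backend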
@ -11,8 +11,7 @@ from typing import Any, Optional, Union

 import torch
 import uvloop
-from benchmark_dataset import (AIMODataset, BurstGPTDataset,
-                               ConversationDataset, InstructCoderDataset,
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
                                RandomDataset, SampleRequest, ShareGPTDataset,
                                SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -213,17 +212,14 @@ def run_hf(
     max_prompt_len = 0
     max_output_len = 0
     for i in range(len(requests)):
-        prompt = requests[i].prompt
-        prompt_len = requests[i].prompt_len
-        output_len = requests[i].expected_output_len
+        prompt, prompt_len, output_len = requests[i]
         # Add the prompt to the batch.
         batch.append(prompt)
         max_prompt_len = max(max_prompt_len, prompt_len)
         max_output_len = max(max_output_len, output_len)
         if len(batch) < max_batch_size and i != len(requests) - 1:
             # Check if we can add more requests to the batch.
-            next_prompt_len = requests[i + 1].prompt_len
-            next_output_len = requests[i + 1].expected_output_len
+            _, next_prompt_len, next_output_len = requests[i + 1]
             if (max(max_prompt_len, next_prompt_len) +
                     max(max_output_len, next_output_len)) <= 2048:
                 # We can add more requests to the batch.
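
The new tuple-style unpacking (`prompt, prompt_len, output_len = requests[i]`) assumes the request object is iterable; a dataclass supports this if it defines `__iter__`. An illustrative sketch, not the benchmark's actual definition:

    from dataclasses import dataclass, astuple

    @dataclass
    class SampleRequest:
        prompt: str
        prompt_len: int
        expected_output_len: int

        def __iter__(self):
            # Yield fields in declaration order so tuple unpacking works.
            return iter(astuple(self))

    prompt, prompt_len, output_len = SampleRequest("hi", 1, 16)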
@ -304,7 +300,6 @@ def get_requests(args, tokenizer):
         "input_len": args.input_len,
         "output_len": args.output_len,
     }

     if args.dataset_path is None or args.dataset_name == "random":
         sample_kwargs["range_ratio"] = args.random_range_ratio
         sample_kwargs["prefix_len"] = args.prefix_len
@ -322,23 +317,18 @@ def get_requests(args, tokenizer):
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = VisionArenaDataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = InstructCoderDataset
-            common_kwargs['dataset_split'] = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = ConversationDataset
-            common_kwargs['dataset_subset'] = args.hf_subset
-            common_kwargs['dataset_split'] = args.hf_split
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = AIMODataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
+        if args.backend != "vllm-chat":
+            raise ValueError(
+                "hf datasets only are supported by vllm-chat backend")
+        # Choose between VisionArenaDataset and HuggingFaceDataset based on
+        # provided parameters.
+        dataset_cls = (VisionArenaDataset if args.dataset_path
+                       == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                       and args.hf_subset is None else HuggingFaceDataset)
+        common_kwargs['dataset_subset'] = args.hf_subset
+        common_kwargs['dataset_split'] = args.hf_split
+        sample_kwargs["enable_multimodal_chat"] = True
     else:
         raise ValueError(f"Unknown dataset name: {args.dataset_name}")
     # Remove None values
@ -472,17 +462,9 @@ def validate_args(args):
         warnings.warn("--hf-subset and --hf-split will be ignored \
                 since --dataset-name is not 'hf'.",
                       stacklevel=2)
-    elif args.dataset_name == "hf":
-        if args.dataset_path in (
-                VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
-                | ConversationDataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend."  #noqa: E501
-        elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
-                                   | AIMODataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend."  #noqa: E501
-        else:
-            raise ValueError(
-                f"{args.dataset_path} is not supported by hf dataset.")
+    elif args.dataset_name == "hf" and args.backend != "vllm-chat":
+        raise ValueError(
+            "When --dataset-name is 'hf', backend must be 'vllm-chat'")

     # --random-range-ratio: only used when dataset_name is 'random'
     if args.dataset_name != 'random' and args.random_range_ratio is not None:
@ -523,13 +505,6 @@ def validate_args(args):
         raise ValueError(
             "Tokenizer must be the same as the model for MII backend.")

-    # --data-parallel is not supported currently.
-    # https://github.com/vllm-project/vllm/issues/16222
-    if args.data_parallel_size > 1:
-        raise ValueError(
-            "Data parallel is not supported in offline benchmark, \
-                please use benchmark serving instead")
-

 if __name__ == "__main__":
     parser = FlexibleArgumentParser(description="Benchmark the throughput.")
@ -601,30 +576,18 @@ if __name__ == "__main__":
         default=None,
         help="Path to the lora adapters to use. This can be an absolute path, "
         "a relative path, or a Hugging Face model identifier.")
-    parser.add_argument(
-        "--prefix-len",
-        type=int,
-        default=None,
-        help=f"Number of prefix tokens to be used in RandomDataset "
-        "and SonnetDataset. For RandomDataset, the total input "
-        "length is the sum of prefix-len (default: "
-        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
-        "sampled from [input_len * (1 - range_ratio), "
-        "input_len * (1 + range_ratio)]. For SonnetDataset, "
-        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
-        "controls how much of the input is fixed lines versus "
-        "random lines, but the total input length remains approximately "
-        "input_len tokens.")
+    parser.add_argument("--prefix-len",
+                        type=int,
+                        default=None,
+                        help="Number of prefix tokens per request."
+                        "This is for the RandomDataset and SonnetDataset")
     # random dataset
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=None,
-        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
-        "for sampling input/output length, "
-        "used only for RandomDataset. Must be in the range [0, 1) to "
-        "define a symmetric sampling range "
-        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+        help="Range of sampled ratio of input/output length, "
+        "used only for RandomDataSet.",
     )

     # hf dtaset
@ -1,236 +0,0 @@ (file removed in its entirety)
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
    MINIMUM_BITBLAS_VERSION)

try:
    import bitblas
    if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
        raise ImportError("bitblas version is wrong. Please "
                          f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
except ImportError as e:
    bitblas_import_exception = e
    raise ValueError("Trying to use the bitblas backend, but could not import"
                     f"with the following error: {bitblas_import_exception}. "
                     "Please install bitblas through the following command: "
                     f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
                     ) from bitblas_import_exception

from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target.")

# Add arguments to the parser
parser.add_argument(
    "--target",
    type=str,
    default=auto_detect_nvidia_target(),
    help="Specify the target device for benchmarking.",
)
parser.add_argument("--group_size",
                    type=int,
                    default=None,
                    help="Group size for grouped quantization.")
parser.add_argument(
    "--A_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "float64", "int32", "int8"],
    help="Data type of activation A.",
)
parser.add_argument(
    "--W_dtype",
    type=str,
    default="int4",
    choices=[
        "float16",
        "float32",
        "float64",
        "int32",
        "int8",
        "int4",
        "int2",
        "int1",
        "nf4",
        "fp4_e2m1",
    ],
    help="Data type of weight W.",
)
parser.add_argument(
    "--accum_dtype",
    type=str,
    default="float16",
    choices=["float16", "int32"],
    help="Data type for accumulation.",
)
parser.add_argument(
    "--out_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "int32", "int8"],
    help="Data type for output.",
)
parser.add_argument(
    "--layout",
    type=str,
    default="nt",
    choices=["nt", "nn"],
    help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument("--with_bias",
                    action="store_true",
                    help="Include bias in the benchmark.")
parser.add_argument(
    "--with_scaling",
    action="store_true",
    help="Include scaling factor in the quantization.",
)
parser.add_argument("--with_zeros",
                    action="store_true",
                    help="Include zeros in the quantization.")
parser.add_argument(
    "--zeros_mode",
    type=str,
    default=None,
    choices=["original", "rescale", "quantized"],
    help="Specify the mode for calculating zeros.",
)

# Parse the arguments
args = parser.parse_args()

# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode

# Define a list of shared arguments that repeat in every config
shared_args = [
    A_dtype,
    W_dtype,
    out_dtype,
    accum_dtype,
    layout,
    with_bias,
    group_size,
    with_scaling,
    with_zeros,
    zeros_mode,
]

# Define just the (M, K, N) shapes in a more compact list
shapes = [
    # square test
    (1, 16384, 16384),
    # BLOOM-176B
    (1, 43008, 14336),
    (1, 14336, 14336),
    (1, 57344, 14336),
    (1, 14336, 57344),
    # OPT-65B
    (1, 9216, 9216),
    (1, 36864, 9216),
    (1, 9216, 36864),
    (1, 22016, 8192),
    # LLAMA-70B/65B
    (1, 8192, 22016),
    (1, 8192, 8192),
    (1, 28672, 8192),
    (1, 8192, 28672),
    # square test
    (16384, 16384, 16384),
    # BLOOM-176B
    (8192, 43008, 14336),
    (8192, 14336, 14336),
    (8192, 57344, 14336),
    (8192, 14336, 57344),
    # OPT-65B
    (8192, 9216, 9216),
    (8192, 36864, 9216),
    (8192, 9216, 36864),
    (8192, 22016, 8192),
    # LLAMA-70B/65B
    (8192, 8192, 22016),
    (8192, 8192, 8192),
    (8192, 28672, 8192),
    (8192, 8192, 28672),
]

# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args))
               for shape in shapes]

benchmark_sets = []
benchmark_sets.extend(test_shapes)

benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
    config = config_class(*input_args)
    matmul = operator(config, target=target, enable_tuning=True)
    kernel_latency = matmul.profile_latency()

    print("Time cost is: {:.3f} ms".format(kernel_latency))

    profile_config = {
        f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
            "BitBLAS_top20_latency": kernel_latency,
        }
    }

    benchmark_results.update(profile_config)

# Define headers for the table
headers = [
    "PrimFunc",
    "Input Arguments",
    "BitBLAS Top20 Latency",
]

# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
    args_split = config_key.split("-")
    func_name = args_split[0]
    input_args_str = "-".join(args_split[1:])
    col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
    col_widths[1] = max(col_widths[1],
                        len(input_args_str) + 2,
                        len(headers[1]) + 2)
    col_widths[2] = max(col_widths[2],
                        len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
                        len(headers[2]) + 2)
    # break only if you want to measure widths from a single example;
    # otherwise, let it loop over all items.

# Print header
for i, header in enumerate(headers):
    headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))

# Print rows
for config_key, values in benchmark_results.items():
    args_split = config_key.split("-")
    func_name = args_split[0]
    input_args_str = "-".join(args_split[1:])
    row = [
        func_name,
        input_args_str,
        f"{values['BitBLAS_top20_latency']:.3f} ms",
    ]
    row_str = "".join(
        [str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)])
    print(row_str)
@ -90,8 +90,7 @@ def bench_run(results: list[benchmark.Measurement], model: str,

     score = torch.randn((m, num_experts), device="cuda", dtype=dtype)

-    topk_weights, topk_ids, token_expert_indices = fused_topk(
-        a, score, topk, renormalize=False)
+    topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

     def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
                        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
@ -17,14 +17,8 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES

-from vllm.triton_utils import HAS_TRITON
-
-if HAS_TRITON:
-    from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
-                                          lora_shrink)
-    from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
-                                                _LORA_B_PTR_DICT)
+from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -6,17 +6,16 @@ import time
 from contextlib import nullcontext
 from datetime import datetime
 from itertools import product
-from types import SimpleNamespace
 from typing import Any, TypedDict

 import ray
 import torch
+import triton
 from ray.experimental.tqdm_ray import tqdm
+from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
-from vllm.transformers_utils.config import get_config
-from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()
@ -31,18 +30,19 @@ class BenchmarkConfig(TypedDict):
     num_stages: int


-def benchmark_config(config: BenchmarkConfig,
-                     num_tokens: int,
-                     num_experts: int,
-                     shard_intermediate_size: int,
-                     hidden_size: int,
-                     topk: int,
-                     dtype: torch.dtype,
-                     use_fp8_w8a8: bool,
-                     use_int8_w8a16: bool,
-                     num_iters: int = 100,
-                     block_quant_shape: List[int] = None,
-                     use_deep_gemm: bool = False) -> float:
+def benchmark_config(
+    config: BenchmarkConfig,
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    num_iters: int = 100,
+    block_quant_shape: List[int] = None,
+) -> float:
     init_dtype = torch.float16 if use_fp8_w8a8 else dtype
     x = torch.randn(num_tokens, hidden_size, dtype=dtype)
     if use_int8_w8a16:
@ -115,41 +115,22 @@ def benchmark_config(
     def run():
         from vllm.model_executor.layers.fused_moe import override_config
         with override_config(config):
-            if use_deep_gemm:
-                topk_weights, topk_ids, token_expert_indices = fused_topk(
-                    x, input_gating, topk, False)
-                return fused_experts(
-                    x,
-                    w1,
-                    w2,
-                    topk_weights,
-                    topk_ids,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                )
+            fused_moe(
+                x,
+                w1,
+                w2,
+                input_gating,
+                topk,
+                renormalize=True,
+                inplace=True,
+                use_fp8_w8a8=use_fp8_w8a8,
+                use_int8_w8a16=use_int8_w8a16,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                block_shape=block_quant_shape,
+            )

     # JIT compilation & warmup
     run()
@ -385,7 +366,6 @@ class BenchmarkWorker:
         use_fp8_w8a8: bool,
         use_int8_w8a16: bool,
         block_quant_shape: List[int] = None,
-        use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
         dtype_str = get_config_dtype_str(dtype,
@ -416,8 +396,7 @@ class BenchmarkWorker:
             use_fp8_w8a8,
             use_int8_w8a16,
             num_iters=100,
-            block_quant_shape=block_quant_shape,
-            use_deep_gemm=use_deep_gemm)
+            block_quant_shape=block_quant_shape)
         return config, kernel_time

     def tune(
@@ -432,7 +411,6 @@ class BenchmarkWorker:
         use_int8_w8a16: bool,
         search_space: list[dict[str, int]],
         block_quant_shape: list[int],
-        use_deep_gemm: bool,
     ) -> dict[str, int]:
         best_config = None
         best_time = float("inf")
@@ -443,14 +421,8 @@ class BenchmarkWorker:
                                               hidden_size, search_space,
                                               is_fp16, topk)

-        need_device_guard = False
-        if current_platform.is_rocm():
-            visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
-            if visible_device != f"{self.device_id}":
-                need_device_guard = True
-
-        with torch.cuda.device(
-                self.device_id) if need_device_guard else nullcontext():
+        with torch.cuda.device(self.device_id) if current_platform.is_rocm(
+        ) else nullcontext():
            for config in tqdm(search_space):
                try:
                    kernel_time = benchmark_config(
@@ -464,8 +436,7 @@ class BenchmarkWorker:
                        use_fp8_w8a8,
                        use_int8_w8a16,
                        num_iters=20,
-                        block_quant_shape=block_quant_shape,
-                        use_deep_gemm=use_deep_gemm)
+                        block_quant_shape=block_quant_shape)
                except triton.runtime.autotuner.OutOfResources:
                    # Some configurations may be invalid and fail to compile.
                    continue
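
The guard change above is an instance of a common conditional-context-manager pattern: enter `torch.cuda.device(...)` only when the process is not already pinned to the right GPU, otherwise fall back to `contextlib.nullcontext()`. A minimal sketch of the same idea, with `device_id` and `work` as stand-in names:

    import os
    from contextlib import nullcontext

    import torch

    def run_on(device_id: int, work):
        # Re-bind the CUDA device only when ROCr visibility does not already
        # pin this process to the intended GPU (mirrors the removed branch).
        need_guard = os.environ.get("ROCR_VISIBLE_DEVICES") != f"{device_id}"
        with torch.cuda.device(device_id) if need_guard else nullcontext():
            return work()
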
@@ -534,13 +505,9 @@ def get_weight_block_size_safety(config, default_value=None):

 def main(args: argparse.Namespace):
     print(args)
-
-    config = get_config(model=args.model,
-                        trust_remote_code=args.trust_remote_code)
-    if args.model_prefix:
-        config = getattr(config, args.model_prefix)
-        config = SimpleNamespace(**config)
-
+    block_quant_shape = None
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
     if config.architectures[0] == "DbrxForCausalLM":
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
@@ -551,21 +518,19 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif (config.architectures[0]
-          in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM")):
+    elif (config.architectures[0] == "DeepseekV3ForCausalLM"
+          or config.architectures[0] == "DeepseekV2ForCausalLM"):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in ("Qwen2MoeForCausalLM",
-                                     "Qwen3MoeForCausalLM"):
+        block_quant_shape = get_weight_block_size_safety(config)
+    elif config.architectures[0] == "Qwen2MoeForCausalLM":
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
-        # Support for llama4
-        config = config.get_text_config()
         # Default: Mixtral.
         E = config.num_local_experts
         topk = config.num_experts_per_tok
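
Both sides read the expert count and top-k from the HuggingFace model config; only the lookup path per architecture differs. A compact sketch of the same lookup for a Mixtral-style config (the model name and `tp_size` value are illustrative):

    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
    E = cfg.num_local_experts                     # 8 experts
    topk = cfg.num_experts_per_tok                # 2 experts per token
    tp_size = 1
    shard_intermediate_size = 2 * cfg.intermediate_size // tp_size
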
@@ -573,11 +538,9 @@ def main(args: argparse.Namespace):
         shard_intermediate_size = 2 * intermediate_size // args.tp_size

     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else getattr(
-        torch, config.torch_dtype)
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
-    block_quant_shape = get_weight_block_size_safety(config)

     if args.batch_size is None:
         batch_sizes = [
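
The dtype change above is about representation only: the removed side holds `torch_dtype` as a string (from a dict-backed config) and resolves it with `getattr`, while the kept side already stores a `torch.dtype`. For example:

    import torch

    dtype_name = "bfloat16"  # what a plain config dict stores
    assert getattr(torch, dtype_name) is torch.bfloat16
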
@@ -587,17 +550,6 @@ def main(args: argparse.Namespace):
     else:
         batch_sizes = [args.batch_size]

-    use_deep_gemm = bool(args.use_deep_gemm)
-
-    if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
-        # Ray will set ROCR_VISIBLE_DEVICES for device visibility
-        logger.warning(
-            "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
-            "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.")
-        val = os.environ["HIP_VISIBLE_DEVICES"]
-        os.environ["ROCR_VISIBLE_DEVICES"] = val
-        del os.environ["HIP_VISIBLE_DEVICES"]
-
     ray.init()
     num_gpus = int(ray.available_resources()["GPU"])
     workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@@ -620,10 +572,10 @@ def main(args: argparse.Namespace):

     start = time.time()
     configs = _distribute(
-        "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                  topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
-                  block_quant_shape, use_deep_gemm)
+        "tune",
+        [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
+          use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
          for batch_size in batch_sizes])
     best_configs = {
         M: sort_config(config)
         for M, config in zip(batch_sizes, configs)
@@ -637,7 +589,7 @@ def main(args: argparse.Namespace):
     outputs = _distribute(
         "benchmark",
         [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-          use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
+          use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
          for batch_size in batch_sizes])

     for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@@ -659,12 +611,10 @@ if __name__ == "__main__":
                         type=str,
                         choices=["auto", "fp8_w8a8", "int8_w8a16"],
                         default="auto")
-    parser.add_argument("--use-deep-gemm", action="store_true")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")
     parser.add_argument("--trust-remote-code", action="store_true")
-    parser.add_argument("--model-prefix", type=str, required=False)
     args = parser.parse_args()

     main(args)
@@ -1,349 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import argparse
-from typing import Any, TypedDict
-
-import ray
-import torch
-from transformers import AutoConfig
-
-from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
-    _moe_permute, _moe_unpermute_and_reduce)
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
-from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
-from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
-
-FP8_DTYPE = current_platform.fp8_dtype()
-
-
-class BenchmarkConfig(TypedDict):
-    BLOCK_SIZE_M: int
-    BLOCK_SIZE_N: int
-    BLOCK_SIZE_K: int
-    GROUP_SIZE_M: int
-    num_warps: int
-    num_stages: int
-
-
-def benchmark_permute(num_tokens: int,
-                      num_experts: int,
-                      hidden_size: int,
-                      topk: int,
-                      dtype: torch.dtype,
-                      use_fp8_w8a8: bool,
-                      use_int8_w8a16: bool,
-                      num_iters: int = 100,
-                      use_customized_permute: bool = False) -> float:
-    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
-    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    # output_hidden_states = torch.empty_like(hidden_states)
-    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
-        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
-    else:
-        align_block_size = None
-        qhidden_states = hidden_states
-
-    gating_output = torch.randn(num_iters,
-                                num_tokens,
-                                num_experts,
-                                dtype=torch.float32)
-
-    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
-    topk_weights, topk_ids, token_expert_indices = fused_topk(
-        qhidden_states, input_gating, topk, False)
-
-    def prepare(i: int):
-        input_gating.copy_(gating_output[i])
-
-    def run():
-        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx,
-             m_indices) = moe_permute(
-                 qhidden_states,
-                 topk_weights=topk_weights,
-                 topk_ids=topk_ids,
-                 token_expert_indices=token_expert_indices,
-                 topk=topk,
-                 n_expert=num_experts,
-                 n_local_expert=num_experts,
-                 expert_map=None,
-                 align_block_size=align_block_size,
-             )
-        else:
-            (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
-             inv_perm) = _moe_permute(qhidden_states, None, topk_ids,
-                                      num_experts, None, align_block_size)
-
-    # JIT compilation & warmup
-    run()
-    torch.cuda.synchronize()
-
-    # Capture 10 invocations with CUDA graph
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        for _ in range(10):
-            run()
-    torch.cuda.synchronize()
-
-    # Warmup
-    for _ in range(5):
-        graph.replay()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies: list[float] = []
-    for i in range(num_iters):
-        prepare(i)
-        torch.cuda.synchronize()
-
-        start_event.record()
-        graph.replay()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    graph.reset()
-    return avg
-
-
-def benchmark_unpermute(num_tokens: int,
-                        num_experts: int,
-                        hidden_size: int,
-                        topk: int,
-                        dtype: torch.dtype,
-                        use_fp8_w8a8: bool,
-                        use_int8_w8a16: bool,
-                        num_iters: int = 100,
-                        use_customized_permute: bool = False) -> float:
-    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
-    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    output_hidden_states = torch.empty_like(hidden_states)
-    if use_fp8_w8a8:
-        align_block_size = 128  # deepgemm needs 128 m aligned block
-        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
-    else:
-        align_block_size = None
-        qhidden_states = hidden_states
-
-    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
-
-    topk_weights, topk_ids, token_expert_indices = fused_topk(
-        qhidden_states, input_gating, topk, False)
-
-    def prepare():
-        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx,
-             m_indices) = moe_permute(
-                 qhidden_states,
-                 topk_weights=topk_weights,
-                 topk_ids=topk_ids,
-                 token_expert_indices=token_expert_indices,
-                 topk=topk,
-                 n_expert=num_experts,
-                 n_local_expert=num_experts,
-                 expert_map=None,
-                 align_block_size=align_block_size,
-             )
-            # convert to fp16/bf16 as gemm output
-            return (permuted_hidden_states.to(dtype), first_token_off,
-                    inv_perm_idx, m_indices)
-        else:
-            (permuted_qhidden_states, a1q_scale, sorted_token_ids, expert_ids,
-             inv_perm) = _moe_permute(qhidden_states, None, topk_ids,
-                                      num_experts, None, align_block_size)
-            # convert to fp16/bf16 as gemm output
-            return (permuted_qhidden_states.to(dtype), a1q_scale,
-                    sorted_token_ids, expert_ids, inv_perm)
-
-    def run(input: tuple):
-        if use_customized_permute:
-            (permuted_hidden_states, first_token_off, inv_perm_idx,
-             m_indices) = input
-            moe_unpermute(permuted_hidden_states, topk_weights, topk_ids,
-                          inv_perm_idx, first_token_off, topk, num_experts,
-                          num_experts)
-        else:
-            (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
-             inv_perm) = input
-            _moe_unpermute_and_reduce(output_hidden_states,
-                                      permuted_hidden_states, inv_perm,
-                                      topk_weights)
-
-    # JIT compilation & warmup
-    input = prepare()
-    run(input)
-    torch.cuda.synchronize()
-
-    # Capture 10 invocations with CUDA graph
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        for _ in range(10):
-            run(input)
-    torch.cuda.synchronize()
-
-    # Warmup
-    for _ in range(5):
-        graph.replay()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies: list[float] = []
-    for i in range(num_iters):
-        torch.cuda.synchronize()
-        start_event.record()
-        graph.replay()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    graph.reset()
-    return avg
-
-
-@ray.remote(num_gpus=1)
-class BenchmarkWorker:
-
-    def __init__(self, seed: int) -> None:
-        torch.set_default_device("cuda")
-        current_platform.seed_everything(seed)
-        self.seed = seed
-        # Get the device ID to allocate tensors and kernels
-        # on the respective GPU. This is required for Ray to work
-        # correctly with multi-GPU tuning on the ROCm platform.
-        self.device_id = int(ray.get_gpu_ids()[0])
-
-    def benchmark(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8_w8a8: bool,
-        use_int8_w8a16: bool,
-        use_customized_permute: bool = False,
-    ) -> tuple[dict[str, int], float]:
-        current_platform.seed_everything(self.seed)
-
-        permute_time = benchmark_permute(
-            num_tokens,
-            num_experts,
-            hidden_size,
-            topk,
-            dtype,
-            use_fp8_w8a8,
-            use_int8_w8a16,
-            num_iters=100,
-            use_customized_permute=use_customized_permute)
-        unpermute_time = benchmark_unpermute(
-            num_tokens,
-            num_experts,
-            hidden_size,
-            topk,
-            dtype,
-            use_fp8_w8a8,
-            use_int8_w8a16,
-            num_iters=100,
-            use_customized_permute=use_customized_permute)
-        return permute_time, unpermute_time
-
-
-def get_weight_block_size_safety(config, default_value=None):
-
-    quantization_config = getattr(config, 'quantization_config', {})
-    if isinstance(quantization_config, dict):
-        return quantization_config.get('weight_block_size', default_value)
-    return default_value
-
-
-def main(args: argparse.Namespace):
-    print(args)
-
-    config = AutoConfig.from_pretrained(
-        args.model, trust_remote_code=args.trust_remote_code)
-    if config.architectures[0] == "DbrxForCausalLM":
-        E = config.ffn_config.moe_num_experts
-        topk = config.ffn_config.moe_top_k
-    elif config.architectures[0] == "JambaForCausalLM":
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-    elif (config.architectures[0] == "DeepseekV3ForCausalLM"
-          or config.architectures[0] == "DeepseekV2ForCausalLM"):
-        E = config.n_routed_experts
-        topk = config.num_experts_per_tok
-    elif config.architectures[0] in [
-            "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"
-    ]:
-        E = config.num_experts
-        topk = config.num_experts_per_tok
-
-    else:
-        # Support for llama4
-        config = config.get_text_config()
-        # Default: Mixtral.
-        E = config.num_local_experts
-        topk = config.num_experts_per_tok
-
-    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
-    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
-    use_int8_w8a16 = args.dtype == "int8_w8a16"
-    use_customized_permute = args.use_customized_permute
-
-    if args.batch_size is None:
-        batch_sizes = [
-            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
-            2048, 3072, 4096
-        ]
-    else:
-        batch_sizes = [args.batch_size]
-
-    ray.init()
-    num_gpus = int(ray.available_resources()["GPU"])
-    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
-
-    def _distribute(method: str, inputs: list[Any]) -> list[Any]:
-        outputs = []
-        worker_idx = 0
-        for input_args in inputs:
-            worker = workers[worker_idx]
-            worker_method = getattr(worker, method)
-            output = worker_method.remote(*input_args)
-            outputs.append(output)
-            worker_idx = (worker_idx + 1) % num_gpus
-        return ray.get(outputs)
-
-    outputs = _distribute(
-        "benchmark", [(batch_size, E, hidden_size, topk, dtype, use_fp8_w8a8,
-                       use_int8_w8a16, use_customized_permute)
-                      for batch_size in batch_sizes])
-
-    for batch_size, (permute, unpermute) in zip(batch_sizes, outputs):
-        print(f"Batch size: {batch_size}")
-        print(f"Permute time: {permute:.2f} us")
-        print(f"Unpermute time: {unpermute:.2f} us")
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
-    parser.add_argument("--model",
-                        type=str,
-                        default="mistralai/Mixtral-8x7B-Instruct-v0.1")
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
-                        default="auto")
-    parser.add_argument("--use-customized-permute", action="store_true")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--batch-size", type=int, required=False)
-    parser.add_argument("--trust-remote-code", action="store_true")
-    args = parser.parse_args()
-
-    main(args)
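
Both deleted helpers above time a CUDA graph that replays the kernel 10 times per launch, so the reported average divides the summed elapsed times by 10 * num_iters and scales milliseconds to microseconds (torch.cuda.Event.elapsed_time reports ms). A worked check with an illustrative latency:

    num_iters, replays_per_graph = 100, 10
    latencies_ms = [0.42] * num_iters  # pretend each graph replay took 0.42 ms
    avg_us = sum(latencies_ms) / (num_iters * replays_per_graph) * 1000
    assert abs(avg_us - 42.0) < 1e-9  # 0.42 ms / 10 invocations = 42 us each
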
@@ -4,11 +4,11 @@ import itertools
 from typing import Optional, Union

 import torch
+import triton
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
 from torch import nn

 from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton


 class HuggingFaceRMSNorm(nn.Module):
@@ -6,13 +6,13 @@ import time
 # Import DeepGEMM functions
 import deep_gemm
 import torch
+import triton
 from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor

 # Import vLLM functions
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8, w8a8_block_fp8_matmul)
-from vllm.triton_utils import triton


 # Copied from
benchmarks/launch_tgi_server.sh (new executable file, 16 lines)
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+PORT=8000
+MODEL=$1
+TOKENS=$2
+
+docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
+           -v "$PWD/data:/data" \
+           ghcr.io/huggingface/text-generation-inference:2.2.0 \
+           --model-id "$MODEL" \
+           --sharded false \
+           --max-input-length 1024 \
+           --max-total-tokens 2048 \
+           --max-best-of 5 \
+           --max-concurrent-requests 5000 \
+           --max-batch-total-tokens "$TOKENS"
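
The script takes a model ID and a max-batch-total-tokens budget as its two positional arguments; an illustrative invocation (model and token budget are example values):

    HF_TOKEN=<your-token> bash benchmarks/launch_tgi_server.sh mistralai/Mistral-7B-Instruct-v0.2 8192
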
@@ -9,10 +9,13 @@ BACKEND=${2:-"vllm"}
 # Define the dataset to use
 DATASET=${3:-"xgrammar_bench"}

+# Define the guided decoding backend
+GUIDED_BACKEND=${4:-"xgrammar"}
+
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-OUTPUT_DIR=${4:-"$SCRIPT_DIR/structured_output_benchmark_results"}
+OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"}

-GUIDED_RATIO=${5:-0.5}
+GUIDED_RATIO=${6:-0.5}

 # Create output directory if it doesn't exist
 mkdir -p "$OUTPUT_DIR"
@@ -24,6 +27,7 @@ QPS_VALUES=(70 60 50 25 20 15 10)
 COMMON_PARAMS="--backend $BACKEND \
                --model $MODEL \
                --dataset $DATASET \
+               --structured-output-backend $GUIDED_BACKEND \
                --structured-output-ratio $GUIDED_RATIO \
                --save-results \
                --result-dir $OUTPUT_DIR"
@@ -31,6 +35,7 @@ COMMON_PARAMS="--backend $BACKEND \
 echo "Starting structured output benchmark with model: $MODEL"
 echo "Backend: $BACKEND"
 echo "Dataset: $DATASET"
+echo "Structured output backend: $GUIDED_BACKEND"
 echo "Results will be saved to: $OUTPUT_DIR"
 echo "----------------------------------------"

@@ -43,7 +48,7 @@ for qps in "${QPS_VALUES[@]}"; do
   GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")

   # Construct filename for this run
-  FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
+  FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"

   # Run the benchmark
   python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
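
On the `+` side, the script gains a guided-decoding backend as the fourth positional argument, shifting OUTPUT_DIR and GUIDED_RATIO to $5 and $6. An illustrative invocation with the new interface (the script's file name is not shown in this excerpt, and the model is an example):

    bash <script>.sh Qwen/Qwen2.5-7B-Instruct vllm xgrammar_bench xgrammar
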
@@ -33,6 +33,8 @@ endif()

 if(MACOSX_FOUND)
     list(APPEND CXX_COMPILE_FLAGS
+        "-Xpreprocessor"
+        "-fopenmp"
         "-DVLLM_CPU_EXTENSION")
 else()
     list(APPEND CXX_COMPILE_FLAGS
@@ -167,33 +169,6 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)

     FetchContent_MakeAvailable(oneDNN)

-    list(APPEND LIBS dnnl)
-elseif(POWER10_FOUND)
-    FetchContent_Declare(
-        oneDNN
-        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.7.2
-        GIT_PROGRESS TRUE
-        GIT_SHALLOW TRUE
-    )
-
-    set(ONEDNN_LIBRARY_TYPE "STATIC")
-    set(ONEDNN_BUILD_DOC "OFF")
-    set(ONEDNN_BUILD_EXAMPLES "OFF")
-    set(ONEDNN_BUILD_TESTS "OFF")
-    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
-    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
-    set(ONEDNN_BUILD_GRAPH "OFF")
-    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
-    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
-    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
-    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-
-    set(DNNL_CPU_RUNTIME "OMP")
-
-    FetchContent_MakeAvailable(oneDNN)
-
     list(APPEND LIBS dnnl)
 endif()

@@ -220,11 +195,6 @@ set(VLLM_EXT_SRC
     "csrc/cpu/torch_bindings.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
-    set(VLLM_EXT_SRC
-        "csrc/cpu/quant.cpp"
-        "csrc/cpu/shm.cpp"
-        ${VLLM_EXT_SRC})
-elseif(POWER10_FOUND)
     set(VLLM_EXT_SRC
         "csrc/cpu/quant.cpp"
         ${VLLM_EXT_SRC})
@@ -245,4 +215,4 @@ define_gpu_extension_target(
     WITH_SOABI
 )

 message(STATUS "Enabling C extension.")
@@ -38,7 +38,7 @@ else()
     FetchContent_Declare(
             vllm-flash-attn
             GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-            GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
+            GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
             GIT_PROGRESS TRUE
             # Don't share the vllm-flash-attn build between build types
             BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -105,14 +105,8 @@ def run(command):
     else:
         enc = locale.getpreferredencoding()
     output = raw_output.decode(enc)
-    if command == 'nvidia-smi topo -m':
-        # don't remove the leading whitespace of `nvidia-smi topo -m`
-        # because they are meaningful
-        output = output.rstrip()
-    else:
-        output = output.strip()
     err = raw_err.decode(enc)
-    return rc, output, err.strip()
+    return rc, output.strip(), err.strip()


 def run_and_read_all(run_lambda, command):
@@ -282,20 +276,12 @@ def get_vllm_version():

     if __version__ == "dev":
         return "N/A (dev)"
-    version_str = __version_tuple__[-1]
-    if isinstance(version_str, str) and version_str.startswith('g'):
-        # it's a dev build
-        if '.' in version_str:
-            # it's a dev build containing local changes
-            git_sha = version_str.split('.')[0][1:]
-            date = version_str.split('.')[-1][1:]
-            return f"{__version__} (git sha: {git_sha}, date: {date})"
-        else:
-            # it's a dev build without local changes
-            git_sha = version_str[1:]  # type: ignore
-            return f"{__version__} (git sha: {git_sha})"
-    return __version__
+
+    if len(__version_tuple__) == 4:  # dev build
+        git_sha = __version_tuple__[-1][1:]  # type: ignore
+        return f"{__version__} (git sha: {git_sha}"
+
+    return __version__


 def summarize_vllm_build_flags():
     # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
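
On the removed side, a dev version suffix splits into a git sha and a date; a worked example with an illustrative value (note, incidentally, that the `+` side's f-string never closes the `(git sha:` parenthesis, so its output ends unbalanced):

    version_str = "g6a5cd40.d20250101"  # hypothetical __version_tuple__[-1]
    git_sha = version_str.split('.')[0][1:]   # "6a5cd40"
    date = version_str.split('.')[-1][1:]     # "20250101"
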
@@ -496,30 +482,16 @@ def get_pip_packages(run_lambda, patterns=None):
     if patterns is None:
         patterns = DEFAULT_PIP_PATTERNS

-    def run_with_pip():
-        try:
-            import importlib.util
-            pip_spec = importlib.util.find_spec('pip')
-            pip_available = pip_spec is not None
-        except ImportError:
-            pip_available = False
-
-        if pip_available:
-            cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
-        elif os.environ.get("UV") is not None:
-            print("uv is set")
-            cmd = ["uv", "pip", "list", "--format=freeze"]
-        else:
-            raise RuntimeError(
-                "Could not collect pip list output (pip or uv module not available)"
-            )
-
-        out = run_and_read_all(run_lambda, cmd)
+    # People generally have `pip` as `pip` or `pip3`
+    # But here it is invoked as `python -mpip`
+    def run_with_pip(pip):
+        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
         return "\n".join(line for line in out.splitlines()
                          if any(name in line for name in patterns))

     pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
-    out = run_with_pip()
+    out = run_with_pip([sys.executable, '-mpip'])

     return pip_version, out
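
Both variants boil down to running `python -mpip list --format=freeze` and keeping only lines that match a pattern set. A standalone sketch of that core (the pattern set here is an illustrative assumption):

    import subprocess
    import sys

    out = subprocess.run([sys.executable, "-mpip", "list", "--format=freeze"],
                         capture_output=True, text=True).stdout
    patterns = ("torch", "vllm")  # illustrative filter set
    print("\n".join(l for l in out.splitlines() if any(p in l for p in patterns)))
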
@@ -545,12 +517,13 @@ def is_xnnpack_available():
     else:
         return "N/A"


 def get_env_vars():
     env_vars = ''
-    secret_terms = ('secret', 'token', 'api', 'access', 'password')
-    report_prefix = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN",
-                     "OMP_", "MKL_", "NVIDIA")
+    secret_terms=('secret', 'token', 'api', 'access', 'password')
+    report_prefix = ("TORCH", "NCCL", "PYTORCH",
+                     "CUDA", "CUBLAS", "CUDNN",
+                     "OMP_", "MKL_",
+                     "NVIDIA")
     for k, v in os.environ.items():
         if any(term in k.lower() for term in secret_terms):
             continue
@@ -561,7 +534,6 @@ def get_env_vars():

     return env_vars

-
 def get_env_info():
     run_lambda = run
     pip_version, pip_list_output = get_pip_packages(run_lambda)
@@ -1,178 +0,0 @@
-#include <optional>
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <algorithm>
-
-#include "attention_dtypes.h"
-#include "attention_utils.cuh"
-
-namespace vllm {
-
-// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
-// can be used to combine partial attention results (in the split-KV case)
-template <typename scalar_t, const uint NUM_THREADS>
-__global__ void merge_attn_states_kernel(
-    scalar_t* output, float* output_lse, const scalar_t* prefix_output,
-    const float* prefix_lse, const scalar_t* suffix_output,
-    const float* suffix_lse, const uint num_tokens, const uint num_heads,
-    const uint head_size) {
-  using pack_128b_t = uint4;
-  const uint pack_size = 16 / sizeof(scalar_t);
-  const uint threads_per_head = head_size / pack_size;
-
-  const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
-  const uint token_head_threads = num_tokens * num_heads * threads_per_head;
-
-  if (global_idx >= token_head_threads) return;
-
-  // global_idx -> token_idx + head_idx + pack_idx
-  const uint token_head_idx = global_idx / threads_per_head;
-  const uint pack_idx = global_idx % threads_per_head;
-
-  const uint token_idx = token_head_idx / num_heads;
-  const uint head_idx = token_head_idx % num_heads;
-
-  const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
-  const uint head_offset =
-      token_idx * num_heads * head_size + head_idx * head_size;
-  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
-  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
-  scalar_t* output_head_ptr = output + head_offset;
-
-  float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
-  float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
-  p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
-  s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
-
-  const float max_lse = fmaxf(p_lse, s_lse);
-  p_lse = p_lse - max_lse;
-  s_lse = s_lse - max_lse;
-  const float p_se = expf(p_lse);
-  const float s_se = expf(s_lse);
-  const float out_se = p_se + s_se;
-  const float p_scale = p_se / out_se;
-  const float s_scale = s_se / out_se;
-
-  if (pack_offset < head_size) {
-    // Pack 128b load
-    pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
-        prefix_head_ptr)[pack_offset / pack_size];
-    pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(
-        suffix_head_ptr)[pack_offset / pack_size];
-    pack_128b_t o_out_pack;
-
-#pragma unroll
-    for (uint i = 0; i < pack_size; ++i) {
-      // Always use float for FMA to keep high precision.
-      // half(uint16_t), bfloat16, float -> float.
-      const float p_out_f =
-          vllm::to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
-      const float s_out_f =
-          vllm::to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
-      // fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
-      const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
-      // float -> half(uint16_t), bfloat16, float.
-      vllm::from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
-    }
-
-    // Pack 128b storage
-    reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
-        o_out_pack;
-  }
-  // We only need to write to output_lse once per head.
-  if (output_lse != nullptr && pack_idx == 0) {
-    float out_lse = logf(out_se) + max_lse;
-    output_lse[head_idx * num_tokens + token_idx] = out_lse;
-  }
-}
-
-}  // namespace vllm
-
-// The following macro is used to dispatch the conversion function based on
-// the output data type. The FN is a macro that calls a function with
-// template<typename scalar_t>.
-#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn)                      \
-  {                                                                     \
-    if (scalar_dtype == at::ScalarType::Float) {                        \
-      fn(float);                                                        \
-    } else if (scalar_dtype == at::ScalarType::Half) {                  \
-      fn(uint16_t);                                                     \
-    } else if (scalar_dtype == at::ScalarType::BFloat16) {              \
-      fn(__nv_bfloat16);                                                \
-    } else {                                                            \
-      TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
-    }                                                                   \
-  }
-
-#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS)                     \
-  {                                                                         \
-    vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS>                   \
-        <<<grid, block, 0, stream>>>(                                       \
-            reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr, \
-            reinterpret_cast<scalar_t*>(prefix_output.data_ptr()),          \
-            reinterpret_cast<float*>(prefix_lse.data_ptr()),                \
-            reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),          \
-            reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens,    \
-            num_heads, head_size);                                          \
-  }
-
-/*@brief Merges the attention states from prefix and suffix
- * into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
- *
- * @param output [n,h,d] The output tensor to store the merged attention states.
- * @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
- * @param prefix_output [n,h,d] The prefix attention states.
- * @param prefix_lse [h,n] The log-sum-exp values for the prefix attention
- * states.
- * @param suffix_output [n,h,d] The suffix attention states.
- * @param suffix_lse [h,n] The log-sum-exp values for the suffix attention
- * states.
- */
-template <typename scalar_t>
-void merge_attn_states_launcher(torch::Tensor& output,
-                                std::optional<torch::Tensor> output_lse,
-                                const torch::Tensor& prefix_output,
-                                const torch::Tensor& prefix_lse,
-                                const torch::Tensor& suffix_output,
-                                const torch::Tensor& suffix_lse) {
-  constexpr uint NUM_THREADS = 128;
-  const uint num_tokens = output.size(0);
-  const uint num_heads = output.size(1);
-  const uint head_size = output.size(2);
-  const uint pack_size = 16 / sizeof(scalar_t);
-  TORCH_CHECK(head_size % pack_size == 0,
-              "headsize must be multiple of pack_size:", pack_size);
-  float* output_lse_ptr = nullptr;
-  if (output_lse.has_value()) {
-    output_lse_ptr = output_lse.value().data_ptr<float>();
-  }
-  // Process one pack elements per thread. for float, the
-  // pack_size is 4 for half/bf16, the pack_size is 8.
-  const uint threads_per_head = head_size / pack_size;
-  const uint total_threads = num_tokens * num_heads * threads_per_head;
-
-  dim3 block(NUM_THREADS);
-  dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
-
-  const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
-  auto stream = at::cuda::getCurrentCUDAStream();
-
-  LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
-}
-
-#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t)                           \
-  {                                                                         \
-    merge_attn_states_launcher<scalar_t>(output, output_lse, prefix_output, \
-                                         prefix_lse, suffix_output,         \
-                                         suffix_lse);                       \
-  }
-
-void merge_attn_states(torch::Tensor& output,
-                       std::optional<torch::Tensor> output_lse,
-                       const torch::Tensor& prefix_output,
-                       const torch::Tensor& prefix_lse,
-                       const torch::Tensor& suffix_output,
-                       const torch::Tensor& suffix_lse) {
-  DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
-}
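
The deleted kernel is a direct implementation of log-sum-exp merging for split attention. Writing $o_p, o_s$ for the prefix/suffix partial outputs, $l_p, l_s$ for their log-sum-exps, and $m = \max(l_p, l_s)$, the kernel's p_scale, s_scale, and out_lse correspond to:

    o = \frac{e^{l_p-m}}{e^{l_p-m}+e^{l_s-m}}\, o_p
      + \frac{e^{l_s-m}}{e^{l_p-m}+e^{l_s-m}}\, o_s,
    \qquad
    l = \log\!\left(e^{l_p-m}+e^{l_s-m}\right) + m.
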
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
-                               torch::Tensor const& q_nope,
-                               torch::Tensor const& q_pe,
-                               torch::Tensor const& kv_c_and_k_pe_cache,
-                               torch::Tensor const& seq_lens,
-                               torch::Tensor const& page_table, double scale);
-#endif
-
-void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
-                        torch::Tensor const& q_pe,
-                        torch::Tensor const& kv_c_and_k_pe_cache,
-                        torch::Tensor const& seq_lens,
-                        torch::Tensor const& page_table, double scale) {
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-  return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                   seq_lens, page_table, scale);
-#endif
-  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
-}
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.h"
-
-#include "cutlass_extensions/common.hpp"
-
-#include "device/sm100_mla.hpp"
-#include "kernel/sm100_mla_tile_scheduler.hpp"
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-template <typename T, bool PersistenceOption = true>
-struct MlaSm100 {
-  using Element = T;
-  using ElementAcc = float;
-  using ElementOut = T;
-
-  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
-  using TileShapeH = cute::tuple_element_t<0, TileShape>;
-  using TileShapeD = cute::tuple_element_t<2, TileShape>;
-
-  // H K (D_latent D_rope) B
-  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
-
-  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
-  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
-  using StrideO = StrideK;                            // H D B
-  using StrideLSE = cute::tuple<_1, int>;             // H B
-
-  using TileScheduler =
-      std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
-                         Sm100MlaIndividualTileScheduler>;
-
-  using FmhaKernel =
-      cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
-          TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
-          /*kIsCpAsync=*/true>;
-  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
-};
-
-template <typename T>
-typename T::Fmha::Arguments args_from_options(
-    at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
-    at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
-    at::Tensor const& page_table, double scale) {
-  cutlass::KernelHardwareInfo hw_info;
-  hw_info.device_id = q_nope.device().index();
-  hw_info.sm_count =
-      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-          hw_info.device_id);
-
-  int batches = q_nope.sizes()[0];
-  int page_count_per_seq = page_table.sizes()[1];
-  int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
-  int page_size = kv_c_and_k_pe_cache.sizes()[1];
-  int max_seq_len = page_size * page_count_per_seq;
-  using TileShapeH = typename T::TileShapeH;
-  using TileShapeD = typename T::TileShapeD;
-  auto problem_shape =
-      cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
-
-  auto [H, K, D, B] = problem_shape;
-  auto [D_latent, D_rope] = D;
-
-  using StrideQ = typename T::StrideQ;
-  using StrideK = typename T::StrideK;
-  using StrideO = typename T::StrideO;
-  using StrideLSE = typename T::StrideLSE;
-
-  StrideQ stride_Q_latent = cute::make_tuple(
-      static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
-  StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
-                                           static_cast<int64_t>(H * D_rope));
-  StrideK stride_C =
-      cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
-                       static_cast<int64_t>(page_size * (D_latent + D_rope)));
-  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
-  StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
-  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
-                                      static_cast<int64_t>(H * D_latent));
-
-  using Element = typename T::Element;
-  using ElementOut = typename T::ElementOut;
-  using ElementAcc = typename T::ElementAcc;
-  auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
-  auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
-  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
-  auto scale_f = static_cast<float>(scale);
-  typename T::Fmha::Arguments arguments{
-      problem_shape,
-      {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
-       stride_C, C_ptr + D_latent, stride_C,
-       static_cast<int*>(seq_lens.data_ptr()),
-       static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
-       page_size},
-      {static_cast<ElementOut*>(out.data_ptr()), stride_O,
-       static_cast<ElementAcc*>(nullptr), stride_LSE},
-      hw_info,
-      -1,       // split_kv
-      nullptr,  // is_var_split_kv
-  };
-  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
-  // split_kv automatically based on batch size and sequence length to balance
-  // workload across available SMs. Consider using var_split_kv for manual
-  // control if needed.
-  T::Fmha::set_split_kv(arguments);
-  return arguments;
-}
-
-template <typename Element>
-void runMla(at::Tensor const& out, at::Tensor const& q_nope,
-            at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
-            at::Tensor const& seq_lens, at::Tensor const& page_table,
-            float scale, cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element>;
-  typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(
-      out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
-  size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
-  auto const workspace_options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
-  auto workspace = torch::empty(workspace_size, workspace_options);
-
-  CUTLASS_CHECK(fmha.can_implement(arguments));
-
-  CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
-
-  CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
-}
-
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
-                               torch::Tensor const& q_nope,
-                               torch::Tensor const& q_pe,
-                               torch::Tensor const& kv_c_and_k_pe_cache,
-                               torch::Tensor const& seq_lens,
-                               torch::Tensor const& page_table, double scale) {
-  TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
-  TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
-  TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
-  TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
-              "kv_c_and_k_pe_cache must be a 3D tensor");
-  TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
-  TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
-  TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
-
-  auto B_q_nope = q_nope.size(0);
-  auto H_q_nope = q_nope.size(1);
-  auto D_q_nope = q_nope.size(2);
-  auto B_q_pe = q_pe.size(0);
-  auto H_q_pe = q_pe.size(1);
-  auto D_q_pe = q_pe.size(2);
-  auto B_pt = page_table.size(0);
-  auto PAGE_NUM = page_table.size(1);
-  auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
-  auto D_ckv = kv_c_and_k_pe_cache.size(2);
-  auto B_o = out.size(0);
-  auto H_o = out.size(1);
-  auto D_o = out.size(2);
-
-  TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
-  TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
-  TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
-  TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
-              "H_q_nope, H_q_pe, and H_o must be equal to 128");
-  TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
-              "PAGE_SIZE must be a power of 2");
-  TORCH_CHECK(
-      B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
-      "Batch dims must be same for page_table, q_nope and q_pe, and out");
-  TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
-              "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
-  TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
-
-  TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
-                  q_nope.dtype() == at::ScalarType::BFloat16 ||
-                  q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
-              "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
-  TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
-                  q_nope.dtype() == q_pe.dtype(),
-              "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
-  TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
-              "seq_lens must be a 32-bit integer tensor");
-  TORCH_CHECK(page_table.dtype() == torch::kInt32,
-              "page_table must be a 32-bit integer tensor");
-
-  auto in_dtype = q_nope.dtype();
-  at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
-  const cudaStream_t stream =
-      at::cuda::getCurrentCUDAStream(q_nope.get_device());
-  if (in_dtype == at::ScalarType::Half) {
-    runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
-                            page_table, scale, stream);
-  } else if (in_dtype == at::ScalarType::BFloat16) {
-    runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                seq_lens, page_table, scale, stream);
-  } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-    runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                  seq_lens, page_table, scale, stream);
-  } else {
-    TORCH_CHECK(false, "Unsupported input data type of MLA");
-  }
-}
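
The deleted entry point's shape checks pin down the expected argument layout. A sketch of tensors that satisfy those checks (all sizes below other than the checked constants 512/64/576/128 are illustrative choices):

    import torch

    B, H = 4, 128                      # batch, heads (H must be 128)
    D_latent, D_rope = 512, 64         # checked dims; cache dim is 512 + 64 = 576
    page_size, pages_per_seq = 64, 16  # page_size must be a power of 2

    q_nope = torch.empty(B, H, D_latent, dtype=torch.bfloat16, device="cuda")
    q_pe = torch.empty(B, H, D_rope, dtype=torch.bfloat16, device="cuda")
    kv_cache = torch.empty(B * pages_per_seq, page_size, D_latent + D_rope,
                           dtype=torch.bfloat16, device="cuda")
    seq_lens = torch.full((B,), 512, dtype=torch.int32, device="cuda")
    page_table = torch.arange(B * pages_per_seq, dtype=torch.int32,
                              device="cuda").view(B, pages_per_seq)
    out = torch.empty(B, H, D_latent, dtype=torch.bfloat16, device="cuda")
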
@ -270,10 +270,9 @@ __global__ void reshape_and_cache_flash_kernel(
|
|||||||
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
|
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
|
||||||
// head_size]
|
// head_size]
|
||||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||||
const int64_t block_stride, const int64_t page_stride,
|
const int block_stride, const int key_stride, const int value_stride,
|
||||||
const int64_t head_stride, const int64_t key_stride,
|
const int num_heads, const int head_size, const int block_size,
|
||||||
const int64_t value_stride, const int num_heads, const int head_size,
|
const float* k_scale, const float* v_scale) {
|
||||||
const int block_size, const float* k_scale, const float* v_scale) {
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
const int64_t slot_idx = slot_mapping[token_idx];
|
const int64_t slot_idx = slot_mapping[token_idx];
|
||||||
// NOTE: slot_idx can be -1 if the token is padded
|
// NOTE: slot_idx can be -1 if the token is padded
|
||||||
@@ -289,8 +288,8 @@ __global__ void reshape_and_cache_flash_kernel(
    const int head_idx = i / head_size;
    const int head_offset = i % head_size;
    const int64_t tgt_key_value_idx = block_idx * block_stride +
-                                      block_offset * page_stride +
-                                      head_idx * head_stride + head_offset;
+                                      block_offset * num_heads * head_size +
+                                      head_idx * head_size + head_offset;
    scalar_t tgt_key = key[src_key_idx];
    scalar_t tgt_value = value[src_value_idx];
    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
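Aside (illustration, not part of the diff): the two sides of this hunk index the cache differently; one reads block/page/head strides off the cache tensor, the other hard-codes a densely packed [num_blocks, block_size, num_heads, head_size] layout. For a contiguous cache the two computations pick the same element, which the hypothetical check below spells out:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t block_size = 16, num_heads = 8, head_size = 128;
  // Strides of a contiguous [num_blocks, block_size, num_heads, head_size]
  // tensor, i.e. what stride(0) / stride(1) / stride(2) would return.
  const int64_t block_stride = block_size * num_heads * head_size;
  const int64_t page_stride = num_heads * head_size;
  const int64_t head_stride = head_size;

  const int64_t block_idx = 3, block_offset = 5, head_idx = 2, head_offset = 7;

  const int64_t via_strides = block_idx * block_stride +
                              block_offset * page_stride +
                              head_idx * head_stride + head_offset;
  const int64_t via_contiguous = block_idx * block_stride +
                                 block_offset * num_heads * head_size +
                                 head_idx * head_size + head_offset;
  assert(via_strides == via_contiguous);  // identical for contiguous caches
  return 0;
}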
@@ -397,16 +396,16 @@ void reshape_and_cache(
// KV_T is the data type of key and value tensors.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE)              \
  vllm::reshape_and_cache_flash_kernel<KV_T, CACHE_T, KV_DTYPE>            \
      <<<grid, block, 0, stream>>>(                                        \
          reinterpret_cast<KV_T*>(key.data_ptr()),                         \
          reinterpret_cast<KV_T*>(value.data_ptr()),                       \
          reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),                \
          reinterpret_cast<CACHE_T*>(value_cache.data_ptr()),              \
-          slot_mapping.data_ptr<int64_t>(), block_stride, page_stride,     \
-          head_stride, key_stride, value_stride, num_heads, head_size,     \
-          block_size, reinterpret_cast<const float*>(k_scale.data_ptr()),  \
+          slot_mapping.data_ptr<int64_t>(), block_stride, key_stride,      \
+          value_stride, num_heads, head_size, block_size,                  \
+          reinterpret_cast<const float*>(k_scale.data_ptr()),              \
          reinterpret_cast<const float*>(v_scale.data_ptr()));

void reshape_and_cache_flash(
@@ -433,11 +432,9 @@ void reshape_and_cache_flash(
  int head_size = key.size(2);
  int block_size = key_cache.size(1);

-  int64_t key_stride = key.stride(0);
-  int64_t value_stride = value.stride(0);
-  int64_t block_stride = key_cache.stride(0);
-  int64_t page_stride = key_cache.stride(1);
-  int64_t head_stride = key_cache.stride(2);
+  int key_stride = key.stride(0);
+  int value_stride = value.stride(0);
+  int block_stride = key_cache.stride(0);
  TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0));

  dim3 grid(num_tokens);
@@ -7,22 +7,3 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
  if (num <= 1) return num;
  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}
-
-template <typename A, typename B>
-static inline constexpr auto div_ceil(A a, B b) {
-  return (a + b - 1) / b;
-}
-
-// Round a down to the next multiple of b. The caller is responsible for making
-// sure that b is non-zero
-template <typename T>
-inline constexpr T round_to_previous_multiple_of(T a, T b) {
-  return a % b == 0 ? a : (a / b) * b;
-}
-
-// Round a up to the next multiple of b. The caller is responsible for making
-// sure that b is non-zero
-template <typename T>
-inline constexpr T round_to_next_multiple_of(T a, T b) {
-  return a % b == 0 ? a : ((a / b) + 1) * b;
-}
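Aside (illustration, not part of the diff): the helpers removed above are plain integer rounding utilities. Restated locally with a few compile-time checks to show what they compute:

#include <cstdint>

// Local restatement of the removed helpers, for illustration only.
template <typename A, typename B>
constexpr auto div_ceil(A a, B b) { return (a + b - 1) / b; }

template <typename T>
constexpr T round_to_previous_multiple_of(T a, T b) {
  return a % b == 0 ? a : (a / b) * b;
}

template <typename T>
constexpr T round_to_next_multiple_of(T a, T b) {
  return a % b == 0 ? a : ((a / b) + 1) * b;
}

static_assert(div_ceil(10, 4) == 3, "ceil(10 / 4)");
static_assert(round_to_previous_multiple_of(10, 4) == 8, "round down");
static_assert(round_to_next_multiple_of(10, 4) == 12, "round up");
static_assert(round_to_next_multiple_of(12, 4) == 12, "already a multiple");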
@@ -4,7 +4,6 @@

#include <altivec.h>
#include <cmath>
-#include <algorithm>
#include <torch/all.h>

namespace vec_op {
@@ -63,10 +62,6 @@ typedef struct f32x4x4_t {
  __vector float val[4];
} f32x4x4_t;

-typedef struct i32x4x4_t {
-  __vector int32_t val[4];
-} i32x4x4_t;
-
struct FP32Vec8;
struct FP32Vec16;

@@ -103,28 +98,6 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
    vec_xst(reg.val[0], 0, (signed short*)ptr);
    vec_xst(reg.val[1], 16, (signed short*)ptr);
  }
-
-  void save(void* ptr, const int elem_num) const {
-    const int clamped_elem = std::max(0, std::min(elem_num, 16));
-
-    // Calculate elements to store in each 128-bit part (8 elements each)
-    const int elements_val0 = std::min(clamped_elem, 8);
-    const int elements_val1 = std::max(clamped_elem - 8, 0);
-
-    // Convert elements to bytes (2 bytes per element)
-    const size_t bytes_val0 = elements_val0 * sizeof(signed short);
-    const size_t bytes_val1 = elements_val1 * sizeof(signed short);
-
-    signed short* dest = static_cast<signed short*>(ptr);
-    // Store the first part using vec_xst_len
-    if (bytes_val0 > 0) {
-      vec_xst_len(reg.val[0], dest, bytes_val0);
-    }
-    // Store the second part if needed
-    if (bytes_val1 > 0) {
-      vec_xst_len(reg.val[1], dest + elements_val0, bytes_val1);
-    }
-  }
};

const static __vector signed short zero = vec_splats((signed short)0);
@@ -284,64 +257,6 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
  }
};

-struct INT32Vec16 : public Vec<INT32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  union AliasReg {
-    i32x4x4_t reg;
-    int32_t values[VEC_ELEM_NUM];
-  };
-
-  i32x4x4_t reg;
-
-  explicit INT32Vec16(const void* data_ptr) {
-    reg.val[0] = vec_xl(0, reinterpret_cast<const __vector int32_t*>(data_ptr));
-    reg.val[1] =
-        vec_xl(16, reinterpret_cast<const __vector int32_t*>(data_ptr));
-    reg.val[2] =
-        vec_xl(32, reinterpret_cast<const __vector int32_t*>(data_ptr));
-    reg.val[3] =
-        vec_xl(48, reinterpret_cast<const __vector int32_t*>(data_ptr));
-  }
-
-  void save(int32_t* ptr) const {
-    vec_xst(reg.val[0], 0, reinterpret_cast<__vector int32_t*>(ptr));
-    vec_xst(reg.val[1], 16, reinterpret_cast<__vector int32_t*>(ptr));
-    vec_xst(reg.val[2], 32, reinterpret_cast<__vector int32_t*>(ptr));
-    vec_xst(reg.val[3], 48, reinterpret_cast<__vector int32_t*>(ptr));
-  }
-
-  void save(int32_t* ptr, const int elem_num) const {
-    const int elements_in_chunk1 =
-        (elem_num >= 0) ? ((elem_num >= 4) ? 4 : elem_num) : 0;
-    const int elements_in_chunk2 =
-        (elem_num > 4) ? ((elem_num >= 8) ? 4 : elem_num - 4) : 0;
-    const int elements_in_chunk3 =
-        (elem_num > 8) ? ((elem_num >= 12) ? 4 : elem_num - 8) : 0;
-    const int elements_in_chunk4 =
-        (elem_num > 12) ? ((elem_num >= 16) ? 4 : elem_num - 12) : 0;
-
-    const size_t bytes_chunk1 =
-        static_cast<size_t>(elements_in_chunk1 * sizeof(int32_t));
-    const size_t bytes_chunk2 =
-        static_cast<size_t>(elements_in_chunk2 * sizeof(int32_t));
-    const size_t bytes_chunk3 =
-        static_cast<size_t>(elements_in_chunk3 * sizeof(int32_t));
-    const size_t bytes_chunk4 =
-        static_cast<size_t>(elements_in_chunk4 * sizeof(int32_t));
-
-    vec_xst_len(reg.val[0], reinterpret_cast<int32_t*>(ptr), bytes_chunk1);
-    vec_xst_len(reg.val[1],
-                reinterpret_cast<int32_t*>(reinterpret_cast<char*>(ptr) + 16),
-                bytes_chunk2);
-    vec_xst_len(reg.val[2],
-                reinterpret_cast<int32_t*>(reinterpret_cast<char*>(ptr) + 32),
-                bytes_chunk3);
-    vec_xst_len(reg.val[3],
-                reinterpret_cast<int32_t*>(reinterpret_cast<char*>(ptr) + 48),
-                bytes_chunk4);
-  }
-};
-
struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  union AliasReg {
@@ -404,13 +319,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}

-  explicit FP32Vec16(const INT32Vec16& v) {
-    reg.val[0] = vec_ctf(v.reg.val[0], 0);
-    reg.val[1] = vec_ctf(v.reg.val[1], 0);
-    reg.val[2] = vec_ctf(v.reg.val[2], 0);
-    reg.val[3] = vec_ctf(v.reg.val[3], 0);
-  }
-
  FP32Vec16 operator*(const FP32Vec16& b) const {
    return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]),
                                vec_mul(reg.val[1], b.reg.val[1]),
@@ -439,117 +347,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
                                vec_div(reg.val[3], b.reg.val[3])}));
  }
-
-  FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
-    return FP32Vec16(f32x4x4_t(
-        {vec_min(max.reg.val[0], vec_max(min.reg.val[0], reg.val[0])),
-         vec_min(max.reg.val[1], vec_max(min.reg.val[1], reg.val[1])),
-         vec_min(max.reg.val[2], vec_max(min.reg.val[2], reg.val[2])),
-         vec_min(max.reg.val[3], vec_max(min.reg.val[3], reg.val[3]))}));
-  }
-
-  FP32Vec16 max(const FP32Vec16& b) const {
-    return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
-                                vec_max(reg.val[1], b.reg.val[1]),
-                                vec_max(reg.val[2], b.reg.val[2]),
-                                vec_max(reg.val[3], b.reg.val[3])}));
-  }
-
-  FP32Vec16 max(const FP32Vec16& b, int elem_num) const {
-    FP32Vec16 result;
-
-    // Create a vector of element indices for each chunk
-    __vector unsigned int indices = {0, 1, 2, 3};
-    __vector unsigned int elem_num_vec =
-        vec_splats(static_cast<unsigned int>(elem_num));
-
-    // Compute masks for each chunk
-    __vector unsigned int chunk_offset0 = {0, 0, 0,
-                                           0};  // Chunk 0: Elements 0-3
-    __vector unsigned int chunk_offset1 = {4, 4, 4,
-                                           4};  // Chunk 1: Elements 4-7
-    __vector unsigned int chunk_offset2 = {8, 8, 8,
-                                           8};  // Chunk 2: Elements 8-11
-    __vector unsigned int chunk_offset3 = {12, 12, 12,
-                                           12};  // Chunk 3: Elements 12-15
-
-    // Compute masks for each chunk
-    __vector bool int mask0 = vec_cmplt(indices + chunk_offset0, elem_num_vec);
-    __vector bool int mask1 = vec_cmplt(indices + chunk_offset1, elem_num_vec);
-    __vector bool int mask2 = vec_cmplt(indices + chunk_offset2, elem_num_vec);
-    __vector bool int mask3 = vec_cmplt(indices + chunk_offset3, elem_num_vec);
-
-    // Apply masks to compute the result for each chunk
-    result.reg.val[0] = vec_sel(this->reg.val[0],
-                                vec_max(this->reg.val[0], b.reg.val[0]), mask0);
-    result.reg.val[1] = vec_sel(this->reg.val[1],
-                                vec_max(this->reg.val[1], b.reg.val[1]), mask1);
-    result.reg.val[2] = vec_sel(this->reg.val[2],
-                                vec_max(this->reg.val[2], b.reg.val[2]), mask2);
-    result.reg.val[3] = vec_sel(this->reg.val[3],
-                                vec_max(this->reg.val[3], b.reg.val[3]), mask3);
-
-    return FP32Vec16(result.reg);
-  }
-
-  FP32Vec16 min(const FP32Vec16& b) const {
-    return FP32Vec16(f32x4x4_t({vec_min(reg.val[0], b.reg.val[0]),
-                                vec_min(reg.val[1], b.reg.val[1]),
-                                vec_min(reg.val[2], b.reg.val[2]),
-                                vec_min(reg.val[3], b.reg.val[3])}));
-  }
-
-  FP32Vec16 min(const FP32Vec16& b, int elem_num) const {
-    FP32Vec16 result;
-
-    vector unsigned int indices = {0, 1, 2, 3};
-    vector unsigned int elem_num_vec =
-        vec_splats(static_cast<unsigned int>(elem_num));
-
-    vector unsigned int chunk_offset0 = {0, 0, 0, 0};
-    vector unsigned int chunk_offset1 = {4, 4, 4, 4};
-    vector unsigned int chunk_offset2 = {8, 8, 8, 8};
-    vector unsigned int chunk_offset3 = {12, 12, 12, 12};
-
-    vector bool int mask0 = vec_cmplt(indices + chunk_offset0, elem_num_vec);
-    vector bool int mask1 = vec_cmplt(indices + chunk_offset1, elem_num_vec);
-    vector bool int mask2 = vec_cmplt(indices + chunk_offset2, elem_num_vec);
-    vector bool int mask3 = vec_cmplt(indices + chunk_offset3, elem_num_vec);
-
-    result.reg.val[0] = vec_sel(this->reg.val[0],
-                                vec_min(this->reg.val[0], b.reg.val[0]), mask0);
-    result.reg.val[1] = vec_sel(this->reg.val[1],
-                                vec_min(this->reg.val[1], b.reg.val[1]), mask1);
-    result.reg.val[2] = vec_sel(this->reg.val[2],
-                                vec_min(this->reg.val[2], b.reg.val[2]), mask2);
-    result.reg.val[3] = vec_sel(this->reg.val[3],
-                                vec_min(this->reg.val[3], b.reg.val[3]), mask3);
-
-    return FP32Vec16(result.reg);
-  }
-
-  FP32Vec16 abs() const {
-    return FP32Vec16(f32x4x4_t({vec_abs(reg.val[0]), vec_abs(reg.val[1]),
-                                vec_abs(reg.val[2]), vec_abs(reg.val[3])}));
-  }
-
-  float reduce_max() {
-    __vector float max01 = vec_max(reg.val[0], reg.val[1]);
-    __vector float max23 = vec_max(reg.val[2], reg.val[3]);
-    __vector float max_all = vec_max(max01, max23);
-    __vector float temp = vec_max(max_all, vec_sld(max_all, max_all, 8));
-    temp = vec_max(temp, vec_sld(temp, temp, 4));
-    return vec_extract(temp, 0);
-  }
-
-  float reduce_min() {
-    __vector float min01 = vec_min(reg.val[0], reg.val[1]);
-    __vector float min23 = vec_min(reg.val[2], reg.val[3]);
-    __vector float min_all = vec_min(min01, min23);
-    __vector float temp = vec_min(min_all, vec_sld(min_all, min_all, 8));
-    temp = vec_min(temp, vec_sld(temp, temp, 4));
-    return vec_extract(temp, 0);
-  }

  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
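Aside (scalar reference, not part of the diff): the removed max(b, elem_num) and min(b, elem_num) overloads build per-lane masks with vec_cmplt and blend with vec_sel, so only the first elem_num lanes take the elementwise max/min while the remaining lanes keep this vector's original values. A plain C++ restatement of the max variant:

#include <algorithm>
#include <array>

// Scalar model of the masked max: lanes [0, elem_num) take max(a, b),
// lanes [elem_num, 16) are passed through from a unchanged.
std::array<float, 16> masked_max(const std::array<float, 16>& a,
                                 const std::array<float, 16>& b,
                                 int elem_num) {
  std::array<float, 16> out = a;
  for (int i = 0; i < std::min(elem_num, 16); ++i) {
    out[i] = std::max(a[i], b[i]);
  }
  return out;
}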
@@ -580,68 +377,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    vec_xst(reg.val[2], 32, ptr);
    vec_xst(reg.val[3], 48, ptr);
  }
-
-  void save(float* ptr, const int elem_num) const {
-    const int elements_in_chunk1 =
-        (elem_num >= 0) ? ((elem_num >= 4) ? 4 : elem_num) : 0;
-    const int elements_in_chunk2 =
-        (elem_num > 4) ? ((elem_num >= 8) ? 4 : elem_num - 4) : 0;
-    const int elements_in_chunk3 =
-        (elem_num > 8) ? ((elem_num >= 12) ? 4 : elem_num - 8) : 0;
-    const int elements_in_chunk4 =
-        (elem_num > 12) ? ((elem_num >= 16) ? 4 : elem_num - 12) : 0;
-
-    const size_t bytes_chunk1 =
-        static_cast<size_t>(elements_in_chunk1 * sizeof(float));
-    const size_t bytes_chunk2 =
-        static_cast<size_t>(elements_in_chunk2 * sizeof(float));
-    const size_t bytes_chunk3 =
-        static_cast<size_t>(elements_in_chunk3 * sizeof(float));
-    const size_t bytes_chunk4 =
-        static_cast<size_t>(elements_in_chunk4 * sizeof(float));
-
-    vec_xst_len(reg.val[0], ptr, bytes_chunk1);
-    vec_xst_len(reg.val[1],
-                reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + 16),
-                bytes_chunk2);
-    vec_xst_len(reg.val[2],
-                reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + 32),
-                bytes_chunk3);
-    vec_xst_len(reg.val[3],
-                reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + 48),
-                bytes_chunk4);
-  }
-};
-
-struct INT8Vec16 : public Vec<INT8Vec16> {
-  constexpr static int VEC_NUM_ELEM = 16;  // 128 bits / 8 bits = 16
-
-  union AliasReg {
-    __vector signed char reg;
-    int8_t values[VEC_NUM_ELEM];
-  };
-
-  __vector signed char reg;
-
-  explicit INT8Vec16(const FP32Vec16& vec) {
-    __vector signed int ret[4];
-    ret[0] = vec_cts(vec.reg.val[0], 0);
-    ret[1] = vec_cts(vec.reg.val[1], 0);
-    ret[2] = vec_cts(vec.reg.val[2], 0);
-    ret[3] = vec_cts(vec.reg.val[3], 0);
-
-    __vector signed short packed1 = vec_packs(ret[0], ret[1]);
-    __vector signed short packed2 = vec_packs(ret[2], ret[3]);
-
-    reg = vec_packs(packed1, packed2);
-  }
-
-  void save(void* ptr) const {
-    *reinterpret_cast<__vector signed char*>(ptr) = reg;
-  }
-  void save(signed char* ptr, const int elem_num) {
-    vec_xst_len(reg, ptr, static_cast<size_t>(elem_num));
-  }
};

template <typename T>
@@ -78,14 +78,9 @@ struct FP16Vec16 : public Vec<FP16Vec16> {

  __m256i reg;

-  // normal load
  explicit FP16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

-  // non-temproal load
-  explicit FP16Vec16(bool, void* ptr)
-      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
-
  explicit FP16Vec16(const FP32Vec16&);

  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@@ -115,14 +110,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {

  __m256i reg;

-  // normal load
  explicit BF16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

-  // non-temproal load
-  explicit BF16Vec16(bool, void* ptr)
-      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
-
  explicit BF16Vec16(const FP32Vec16&);

  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@@ -323,13 +313,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

-  // normal load
  explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}

-  // non-temproal load
-  explicit FP32Vec16(bool, void* ptr)
-      : reg((__m512)_mm512_stream_load_si512(ptr)) {}
-
  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec4& data)
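Aside (illustration, not part of the diff): the constructors removed in the last three hunks pair a regular unaligned load with a non-temporal (streaming) load that bypasses the cache hierarchy; the streaming form expects 64-byte-aligned data that will not be re-read soon. A minimal free-function sketch of the same choice:

#include <immintrin.h>

// Hypothetical helper mirroring the two FP32Vec16 constructors shown above.
// The streaming path assumes `src` is 64-byte aligned and read exactly once.
inline __m512 load_fp32x16(const float* src, bool non_temporal) {
  if (non_temporal) {
    // Same cast pattern as the removed constructor.
    return (__m512)_mm512_stream_load_si512(const_cast<float*>(src));
  }
  return _mm512_loadu_ps(src);
}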
@@ -562,33 +547,6 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
    _mm_mask_storeu_epi8(ptr, mask, reg);
  }
};
-
-struct INT8Vec64 : public Vec<INT8Vec64> {
-  constexpr static int VEC_ELEM_NUM = 64;
-  union AliasReg {
-    __m512i reg;
-    int8_t values[VEC_ELEM_NUM];
-  };
-
-  __m512i reg;
-
-  // normal load
-  explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
-
-  // non-temproal load
-  explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
-
-  void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
-
-  void save(int8_t* ptr, const int elem_num) const {
-    constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
-    __mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
-    _mm512_mask_storeu_epi8(ptr, mask, reg);
-  }
-
-  // non-temproal save
-  void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
-};
#endif

template <typename T>
@@ -699,22 +657,6 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {

inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }

-#ifdef __AVX512F__
-inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
-  _mm256_stream_si256((__m256i*)ptr, vec.reg);
-}
-inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
-  _mm512_stream_si512((__m512i*)ptr, vec.reg);
-}
-inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
-  _mm256_stream_si256((__m256i*)ptr, vec.reg);
-}
-inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
-  _mm512_stream_ps((float*)ptr, vec.reg);
-}
-#endif
-
-inline void mem_barrier() { _mm_mfence(); }
};  // namespace vec_op

#endif
@@ -9,8 +9,7 @@ void rotary_embedding_impl(
    scalar_t* __restrict__ query,  /// [batch_size, seq_len, num_heads,
                                   /// head_size] or [num_tokens, num_heads,
                                   /// head_size]
-    scalar_t* __restrict__ key,   // nullptr (optional) or
-                                  // [batch_size, seq_len, num_kv_heads,
+    scalar_t* __restrict__ key,   // [batch_size, seq_len, num_kv_heads,
                                   // head_size] or [num_tokens, num_kv_heads,
                                   // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
@@ -86,13 +85,10 @@ void rotary_embedding_impl(
      compute_loop(token_head, cache_ptr, query);
    }

-    if (key != nullptr) {
-      for (int i = 0; i < num_kv_heads; ++i) {
-        const int head_idx = i;
-        const int64_t token_head =
-            token_idx * key_stride + head_idx * head_size;
-        compute_loop(token_head, cache_ptr, key);
-      }
+    for (int i = 0; i < num_kv_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, key);
    }
  }
}
@@ -104,8 +100,7 @@ void rotary_embedding_gptj_impl(
    scalar_t* __restrict__ query,  /// [batch_size, seq_len, num_heads,
                                   /// head_size] or [num_tokens, num_heads,
                                   /// head_size]
-    scalar_t* __restrict__ key,   // nullptr (optional) or
-                                  // [batch_size, seq_len, num_kv_heads,
+    scalar_t* __restrict__ key,   // [batch_size, seq_len, num_kv_heads,
                                   // head_size] or [num_tokens, num_kv_heads,
                                   // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
@@ -143,10 +138,6 @@ void rotary_embedding_gptj_impl(
    }
  }

-  if (key == nullptr) {
-    return;
-  }
-
#pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_kv_heads; ++i) {
@@ -177,13 +168,13 @@ void rotary_embedding_gptj_impl(
};  // namespace

void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      std::optional<torch::Tensor> key, int64_t head_size,
+                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
  int num_tokens = positions.numel();
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
-  int num_kv_heads = key.has_value() ? key->size(-1) / head_size : num_heads;
-  int64_t key_stride = key.has_value() ? key->stride(-2) : 0;
+  int num_kv_heads = key.size(-1) / head_size;
+  int64_t key_stride = key.stride(-2);
  int64_t query_stride = query.stride(-2);

  VLLM_DISPATCH_FLOATING_TYPES(
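Aside (simplified sketch, not the actual binding): this hunk swaps between a required torch::Tensor& key and an std::optional<torch::Tensor> key. With the optional form, a missing key collapses to a null pointer and zero stride before the kernel dispatch, as the hypothetical caller below illustrates:

#include <optional>
#include <torch/all.h>

// Hypothetical caller showing the optional-key pattern from this hunk.
void launch_rotary(torch::Tensor& query, std::optional<torch::Tensor> key,
                   int64_t head_size) {
  int64_t num_heads = query.size(-1) / head_size;
  int64_t num_kv_heads =
      key.has_value() ? key->size(-1) / head_size : num_heads;
  float* key_ptr = key.has_value() ? key->data_ptr<float>() : nullptr;
  int64_t key_stride = key.has_value() ? key->stride(-2) : 0;
  // ... forward key_ptr, key_stride, num_kv_heads to
  //     rotary_embedding_impl / rotary_embedding_gptj_impl ...
  (void)key_ptr;
  (void)key_stride;
  (void)num_kv_heads;
}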
@@ -192,15 +183,15 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
        if (is_neox) {
          rotary_embedding_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride,
-              key_stride, num_heads, num_kv_heads, head_size, num_tokens);
+              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
+              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
+              head_size, num_tokens);
        } else {
          rotary_embedding_gptj_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(), rot_dim, query_stride,
-              key_stride, num_heads, num_kv_heads, head_size, num_tokens);
+              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
+              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
+              head_size, num_tokens);
        }

        CPU_KERNEL_GUARD_OUT(rotary_embedding_impl)
Some files were not shown because too many files have changed in this diff.